# howard.objects.variants
import csv
import gc
import gzip
import io
import multiprocessing
import os
import random
import re
import shlex
import sqlite3
import subprocess
from tempfile import NamedTemporaryFile, TemporaryDirectory
import tempfile
import duckdb
import json
import yaml
import argparse
import Bio.bgzf as bgzf
import pandas as pd
from pyfaidx import Fasta
import numpy as np
import vcf
import logging as log
import fastparquet as fp
from multiprocesspandas import applyparallel

from howard.functions.commons import *
from howard.objects.database import *
from howard.functions.databases import *
from howard.functions.utils import *


class Variants:

    # NOTE(review): `config`/`param` use mutable dict defaults; kept as-is to
    # preserve the public signature, but callers should not rely on sharing.
    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = {},
        param: dict = {},
        load: bool = False,
    ) -> None:
        """
        Initialize the object: variables, input, config, param, output,
        connexion and header, and optionally load the data.

        :param conn: existing database connection (a new one is created if None)
        :param input: input file path (or file-like object)
        :param output: output file path (or file-like object)
        :param config: configuration dictionary
        :param param: parameters dictionary
        :param load: if True, load the input data immediately
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Load data
        if load:
            self.load_data()

    def set_input(self, input: str = None) -> None:
        """
        Set the input file attributes: path, name, extension and format.

        Accepts either a path string or a file-like object exposing a
        ``name`` attribute.

        :param input: input file (path string or file-like object)
        :raises ValueError: if a non-string input has no ``name`` attribute
        :type input: str
        """

        if input and not isinstance(input, str):
            try:
                self.input = input.name
            except:
                # Fixed message: original f-string was missing the closing quote
                log.error(f"Input file '{input}' in bad format")
                raise ValueError(f"Input file '{input}' in bad format")
        else:
            self.input = input

        # Input format (derived from the file extension, without the dot)
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            self.input_format = self.input_extension.replace(".", "")

    def set_config(self, config: dict) -> None:
        """
        Set the configuration object for the class.

        :param config: dictionary of configuration settings
        :type config: dict
        """

        self.config = config

    def set_param(self, param: dict) -> None:
        """
        Set the parameters object for the class.

        :param param: dictionary of parameters
        :type param: dict
        """

        self.param = param

    def init_variables(self) -> None:
        """
        Initialize the variables used by the rest of the class.
        """

        self.prefix = "howard"
        self.table_variants = "variants"
        self.dataframe = None

        # Mapping from comparison keywords to SQL operators
        self.comparison_map = {
            "gt": ">",
            "gte": ">=",
            "lt": "<",
            "lte": "<=",
            "equals": "=",
            "contains": "SIMILAR TO",
        }

        # VCF header type -> internal type code
        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

        # VCF header type -> SQL column type
        self.code_type_map_to_sql = {
            "Integer": "INTEGER",
            "String": "VARCHAR",
            "Float": "FLOAT",
            "Flag": "VARCHAR",
        }

        self.index_additionnal_fields = []

    def get_indexing(self) -> bool:
        """
        Return the value of the "indexing" key in the param dictionary,
        or False if the key is not present.

        :return: the value of the indexing parameter
        """

        return self.get_param().get("indexing", False)

    def get_connexion_config(self) -> dict:
        """
        Build the configuration dictionary for the database connection,
        including threads, memory limit, temporary directory and access mode.

        :return: a dictionary with the connexion configuration
        """

        # config
        config = self.get_config()

        # Connexion config
        connexion_config = {}
        threads = self.get_threads()

        # Threads
        if threads:
            connexion_config["threads"] = threads

        # Memory
        if self.get_memory():
            connexion_config["memory_limit"] = self.get_memory()

        # Temporary directory
        if config.get("tmp", None):
            connexion_config["temp_directory"] = config.get("tmp")

        # Access
        if config.get("access", None):
            access = config.get("access")
            if access in ["RO"]:
                access = "READ_ONLY"
            elif access in ["RW"]:
                access = "READ_WRITE"
            connexion_db = self.get_connexion_db()
            # NOTE(review): substring test kept from original — an in-memory
            # database is always opened READ_WRITE; confirm `==` was intended.
            if connexion_db in ":memory:":
                access = "READ_WRITE"
            connexion_config["access_mode"] = access

        return connexion_config

    def get_duckdb_settings(self) -> dict:
        """
        Retrieve DuckDB settings from the configuration, either from a file
        (YAML/JSON) or from a JSON string.

        :return: a dictionary of DuckDB settings
        """

        # config
        config = self.get_config()

        # duckdb settings
        duckdb_settings_dict = {}
        if config.get("duckdb_settings", None):
            duckdb_settings = config.get("duckdb_settings")
            duckdb_settings = full_path(duckdb_settings)
            # duckdb settings is a file
            if os.path.exists(duckdb_settings):
                with open(duckdb_settings) as json_file:
                    duckdb_settings_dict = yaml.safe_load(json_file)
            # duckdb settings is a string
            else:
                duckdb_settings_dict = json.loads(duckdb_settings)

        return duckdb_settings_dict

    def set_connexion_db(self) -> str:
        """
        Determine and store the database connection string, based on the
        input format and the configured connexion type.

        :return: the connexion db string (file path or ":memory:")
        """

        # Default connexion db
        default_connexion_db = ":memory:"

        # Find connexion db
        if self.get_input_format() in ["db", "duckdb"]:
            # Input file IS the database
            connexion_db = self.get_input()
        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
            connexion_db = default_connexion_db
        elif self.get_connexion_type() in ["tmpfile"]:
            tmp_name = tempfile.mkdtemp(
                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
            )
            connexion_db = f"{tmp_name}/tmp.db"
        elif self.get_connexion_type() != "":
            # Explicit path given as connexion type
            connexion_db = self.get_connexion_type()
        else:
            connexion_db = default_connexion_db

        # Set connexion db
        self.connexion_db = connexion_db

        return connexion_db

    def set_connexion(self, conn) -> None:
        """
        Create (if needed) and store the connection to the database,
        according to the configured format (duckdb or sqlite).

        :param conn: an existing connection, or None to create a new one
        """

        # Connexion db
        connexion_db = self.set_connexion_db()

        # Connexion config
        connexion_config = self.get_connexion_config()

        # Connexion format
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # duckDB settings applied through PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

    def set_output(self, output: str = None) -> None:
        """
        Set the output file attributes: path, name, extension and format.

        :param output: output file (path string or file-like object);
            if None, all output attributes are set to None
        :type output: str
        """

        if output and not isinstance(output, str):
            self.output = output.name
        else:
            self.output = output

        # Output format (derived from the file extension, without the dot)
        if self.output:
            output_name, output_extension = os.path.splitext(self.output)
            self.output_name = output_name
            self.output_extension = output_extension
            self.output_format = self.output_extension.replace(".", "")
        else:
            self.output_name = None
            self.output_extension = None
            self.output_format = None

    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (``header_list``) and as a VCF object (``header_vcf``).
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:

                        # Best effort: fall back to a default VCF header
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None

    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        Execute an SQL query and return the result as a pandas DataFrame,
        according to the connexion format.

        :param query: the SQL query to execute
        :type query: str
        :param limit: maximum number of rows fetched into the DataFrame;
            if None, the full result is fetched
        :type limit: int
        :return: a pandas DataFrame with the query result
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query
        if limit:
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                # Fetch only the first record batch of `limit` rows
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df

    def get_overview(self) -> None:
        """
        Log an overview of the current object: input, output, config, param,
        sample list and the variants DataFrame.
        """
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input: "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Sample list: " + str(self.get_header_sample_list()))
        log.info("Dataframe: ")
        for d in str(df).split("\n"):
            log.info("\t" + str(d))

        # garbage collector
        del df
        gc.collect()

        return None

    def get_stats(self) -> dict:
        """
        Calculate and return statistics of the current object: input file,
        variants (counts by chromosome, SNV/InDel/MNV, substitutions),
        samples, header fields and quality.

        :return: a dictionary of statistics
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: count genotypes per sample when GT/FORMAT available
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                sql_query_samples = f"""
                    SELECT '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                        )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map special VCF "Number" codes to their symbols
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel
        # NOTE(review): in the InDel clause below, SQL AND binds tighter than
        # OR, so the condition is `len(REF) > 1 OR (len(ALT) > 1 AND
        # len(REF) != len(ALT))` — kept as in the original; verify intent.

        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                  AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                  AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

            """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
            """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

    def stats_to_file(self, file: str = None) -> str:
        """
        Serialize the statistics into JSON and write them to the given file.

        :param file: path of the JSON file to write
        :type file: str
        :return: the name of the file written
        """

        # Get stats
        stats = self.get_stats()

        # Serializing json
        json_object = json.dumps(stats, indent=4)

        # Writing to file
        with open(file, "w") as outfile:
            outfile.write(json_object)

        return file

    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a Markdown stats file from the JSON statistics and print it.

        :param output_file: path of the Markdown output file; if None, a
            temporary "stats.md" file is used
        :type output_file: str
        :param json_file: path of the JSON stats file; if None, a temporary
            "stats.json" file is used
        :type json_file: str
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Print stats file
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the info as a Markdown table, either
                        # directly from a dict or from a JSON-encoded string
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f" - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None

    def get_input(self) -> str:
        """
        Return the input file.

        :return: the input file path
        """
        return self.input

    def get_input_format(self, input_file: str = None) -> str:
        """
        Return the format of the input file.

        :param input_file: input file path; defaults to the object's input
        :type input_file: str
        :return: the input file format
        """

        if not input_file:
            input_file = self.get_input()
        input_format = get_file_format(input_file)
        return input_format

    def get_input_compressed(self, input_file: str = None) -> str:
        """
        Return whether the input file is compressed.

        :param input_file: input file path; defaults to the object's input
        :type input_file: str
        :return: the compression status of the input file
        """

        if not input_file:
            input_file = self.get_input()
        input_compressed = get_file_compressed(input_file)
        return input_compressed

    def get_output(self) -> str:
        """
        Return the output file.

        :return: the output file path
        """

        return self.output

    def get_output_format(self, output_file: str = None) -> str:
        """
        Return the format of the output file.

        :param output_file: output file path; defaults to the object's output
        :type output_file: str
        :return: the output file format
        """

        if not output_file:
            output_file = self.get_output()
        output_format = get_file_format(output_file)

        return output_format

    def get_config(self) -> dict:
        """
        Return the config.

        :return: the config dictionary
        """
        return self.config

    def get_param(self) -> dict:
        """
        Return the param.

        :return: the param dictionary
        """
        return self.param

    def get_connexion_db(self) -> str:
        """
        Return the connexion_db attribute of the object.

        :return: the connexion db string
        """
        return self.connexion_db

    def get_prefix(self) -> str:
        """
        Return the prefix of the object.

        :return: the prefix
        """
        return self.prefix

    def get_table_variants(self, clause: str = "select") -> str:
        """
        Return the variants table reference for use in an SQL clause.

        :param clause: the type of clause the table will be used in, either
            "select", "where", "update" or "from"; defaults to "select"
        :return: the table reference string
        """

        # Access
        access = self.get_config().get("access", None)

        # Clauses "select", "where", "update"
        if clause in ["select", "where", "update"]:
            table_variants = self.table_variants
        # Clause "from"
        elif clause in ["from"]:
            # For Read Only parquet input, query the file directly
            if self.get_input_format() in ["parquet"] and access in ["RO"]:
                input_file = self.get_input()
                table_variants = f"'{input_file}' as variants"
            # For Read Write
            else:
                table_variants = f"{self.table_variants} as variants"
        else:
            table_variants = self.table_variants
        return table_variants

    def get_tmp_dir(self) -> str:
        """
        Return the temporary directory path based on configuration and
        parameters, with "/tmp" as default.

        :return: the temporary directory path
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )

    def get_connexion_type(self) -> str:
        """
        Return the connexion type from the config ("memory" by default).

        :return: the connexion type
        """
        return self.get_config().get("connexion_type", "memory")

    def get_connexion(self):
        """
        Return the connection object.

        :return: the connection object
        """
        return self.conn

    def close_connexion(self) -> None:
        """
        Close the connection to the database.

        :return: the result of closing the connection
        """
        return self.conn.close()

    def get_header(self, type: str = "vcf"):
        """
        Return the header of the VCF file.

        :param type: "vcf" for a vcf.Reader object, "list" for a list of
            strings; defaults to "vcf"
        :return: the header in the requested form
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            # No header loaded: fall back to the minimal required VCF header
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required

    def get_header_length(self, file: str = None) -> int:
        """
        Return the length of the header list, excluding the #CHROM line.

        :param file: optional path to a VCF header file to read instead of
            the object's header
        :type file: str
        :return: the header length, excluding the #CHROM line
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0

    def get_header_columns(self) -> str:
        """
        Return the columns line (#CHROM line) of the VCF header.

        :return: the #CHROM header line, or "" if no header
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""

    def get_header_columns_as_list(self) -> list:
        """
        Return the columns of the VCF header as a list.

        :return: the header column names, or [] if no header
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []

    def get_header_columns_as_sql(self) -> str:
        """
        Return the VCF header columns as a comma-separated list of quoted
        SQL identifiers.

        :return: the header columns formatted for SQL
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)

    def get_header_sample_list(self) -> list:
        """
        Return the list of sample names from the VCF header.

        :return: the sample list
        """
        return self.header_vcf.samples

    def get_verbose(self) -> bool:
        """
        Return the value of the "verbose" key in the config dictionary,
        or False if the key doesn't exist.

        :return: the verbose flag
        """
        return self.get_config().get("verbose", False)

    def get_connexion_format(self) -> str:
        """
        Return the connexion format of the object.

        :raises ValueError: if the format is not "duckdb" or "sqlite"
        :return: the connexion format
        """
        connexion_format = self.connexion_format
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format

    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a file in chunks and insert each chunk into the variants table
        according to the connexion format.

        :param file: path of the file to load into the table
        :param columns: comma-separated column names of the target table
        :type columns: str
        :param header_len: number of header lines to skip before the data,
            defaults to 0
        :type header_len: int (optional)
        :param sep: field separator used in the file, defaults to "\\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk, defaults to 1000000
        """
        # NOTE(review): SOURCE is truncated here -- the body of this method is
        # not visible in this chunk and must be restored from the original file.
This means that the file will be read in chunks of 1,, defaults 1166 to 1000000 1167 :type chunksize: int (optional) 1168 """ 1169 1170 # Config 1171 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1172 connexion_format = self.get_connexion_format() 1173 1174 log.debug("chunksize: " + str(chunksize)) 1175 1176 if chunksize: 1177 for chunk in pd.read_csv( 1178 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1179 ): 1180 if connexion_format in ["duckdb"]: 1181 sql_insert_into = ( 1182 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1183 ) 1184 self.conn.execute(sql_insert_into) 1185 elif connexion_format in ["sqlite"]: 1186 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1187 1188 def load_data( 1189 self, 1190 input_file: str = None, 1191 drop_variants_table: bool = False, 1192 sample_size: int = 20480, 1193 ) -> None: 1194 """ 1195 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1196 table before loading the data and specify a sample size. 1197 1198 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1199 table 1200 :type input_file: str 1201 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1202 determines whether the variants table should be dropped before loading the data. If set to 1203 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1204 not be dropped, defaults to False 1205 :type drop_variants_table: bool (optional) 1206 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1207 the input file. 
def load_data(
    self,
    input_file: str = None,
    drop_variants_table: bool = False,
    sample_size: int = 20480,
) -> None:
    """
    Read the input file and load it into the variants table.

    :param input_file: path to the input file to load; if provided, it
        replaces the current input and the header is re-read
    :param drop_variants_table: drop the variants table before loading,
        defaults to False
    :param sample_size: number of rows sampled to infer the schema; falsy
        values mean "use all rows" (-1), defaults to 20480
    :raises ValueError: if the input format is not available or not
        compatible with the database connexion format
    """

    log.info("Loading...")

    # Change input file (and reload header accordingly)
    if input_file:
        self.set_input(input_file)
        self.set_header()

    # Drop variants table if requested
    if drop_variants_table:
        self.drop_variants_table()

    # Table to load into
    table_variants = self.get_table_variants()

    # Access mode (e.g. "RO" for read-only)
    access = self.get_config().get("access", None)
    log.debug(f"access: {access}")

    # Input format and compression
    input_format = self.get_input_format()
    input_compressed = self.get_input_compressed()
    log.debug(f"input_format: {input_format}")
    log.debug(f"input_compressed: {input_compressed}")

    # Compression format label (for logging)
    input_compressed_format = "gzip" if input_compressed else "none"
    log.debug(f"input_compressed_format: {input_compressed_format}")

    # Connexion format ("duckdb" or "sqlite")
    connexion_format = self.get_connexion_format()

    # Sample size (-1 means all rows)
    if not sample_size:
        sample_size = -1
    log.debug(f"sample_size: {sample_size}")

    # Load data
    log.debug(f"Load Data from {input_format}")

    # DuckDB connexion
    if connexion_format in ["duckdb"]:

        # Input is already a duckDB database: nothing to load
        if self.input_format in ["db", "duckdb"]:
            log.debug(f"Input file format '{self.input_format}' duckDB")

        # Load from another format through the Database helper
        else:
            try:
                # Create Table or View
                database = Database(database=self.input)
                sql_from = database.get_sql_from(sample_size=sample_size)
                # Read-only access creates a view, otherwise a table
                if access in ["RO"]:
                    sql_load = (
                        f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                    )
                else:
                    sql_load = (
                        f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                    )
                self.conn.execute(sql_load)
            except Exception:
                # Format not available
                log.error(f"Input file format '{self.input_format}' not available")
                raise ValueError(
                    f"Input file format '{self.input_format}' not available"
                )

    # SQLite connexion: only delimited text formats are supported
    elif connexion_format in ["sqlite"] and input_format in [
        "vcf",
        "tsv",
        "csv",
        "psv",
    ]:

        # Main VCF columns
        structure = {
            "#CHROM": "VARCHAR",
            "POS": "INTEGER",
            "ID": "VARCHAR",
            "REF": "VARCHAR",
            "ALT": "VARCHAR",
            "QUAL": "VARCHAR",
            "FILTER": "VARCHAR",
            "INFO": "VARCHAR",
        }

        # Structure extended with FORMAT and sample columns
        # (fix: copy instead of aliasing, so the base structure is not
        # mutated as a side effect)
        structure_complete = dict(structure)
        if self.get_header_sample_list():
            structure_complete["FORMAT"] = "VARCHAR"
            for sample in self.get_header_sample_list():
                structure_complete[sample] = "VARCHAR"

        # Column lists for CREATE and INSERT
        sql_create_table_columns = []
        sql_create_table_columns_list = []
        for column, column_type in structure_complete.items():
            sql_create_table_columns.append(
                f'"{column}" {column_type} default NULL'
            )
            sql_create_table_columns_list.append(f'"{column}"')

        # Create database table
        log.debug(f"Create Table {table_variants}")
        sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
        sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
        sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
        self.conn.execute(sql_create_table)

        # chunksize define length of file chunk load file
        chunksize = 100000

        # Field delimiter for the input format
        delimiter = file_format_delimiters.get(input_format, "\t")

        # Number of header lines to skip (VCF only)
        if input_format in ["vcf"]:
            header_len = self.get_header_length()
        else:
            header_len = 0

        # Load the input file
        # (fix: open a single, properly-closed handle with the appropriate
        # opener, instead of opening the file twice and leaking the
        # unclosed bgzf handle)
        opener = bgzf.open if input_compressed else open
        with opener(self.input, "rt") as input_stream:
            # Insert the file contents into the table
            self.insert_file_to_table(
                input_stream,
                columns=sql_create_table_columns_list_sql,
                header_len=header_len,
                sep=delimiter,
                chunksize=chunksize,
            )

    else:
        log.error(
            f"Connexion format '{connexion_format}' not available with format '{input_format}'"
        )
        raise ValueError(
            f"Connexion format '{connexion_format}' not available with format '{input_format}'"
        )

    # Explode INFO fields into table columns
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=True,
        )

    # Create index after insertion
    self.create_indexes()


def get_explode_infos(self) -> bool:
    """
    Return the "explode_infos" flag from parameters (False by default).

    :return: whether INFO fields should be exploded into columns
    """

    return self.get_param().get("explode", {}).get("explode_infos", False)


# NOTE(review): get_explode_infos_fields starts here in the original; it is
# rendered in full in the next part of this file.
def get_explode_infos_fields(
    self,
    explode_infos_fields: str = None,
    remove_fields_not_in_header: bool = False,
) -> list:
    """
    Return the list of INFO fields to explode, resolved from the
    `explode_infos_fields` argument or from parameters.

    Patterns are supported: "*" means every field of the header, and each
    entry is treated as a regular expression matched against the header
    INFO fields.

    :param explode_infos_fields: comma-separated string (or list) of field
        names/patterns; falls back to the "explode_infos_fields" parameter,
        then to "*"
    :param remove_fields_not_in_header: if True, drop fields that are not
        declared in the header, defaults to False
    :return: the resolved list of INFO field names
    """

    # If no fields, get it in param
    if not explode_infos_fields:
        explode_infos_fields = (
            self.get_param().get("explode", {}).get("explode_infos_fields", None)
        )

    # If no fields, defined as all fields in header using keyword
    if not explode_infos_fields:
        explode_infos_fields = "*"

    # If fields list not empty
    if explode_infos_fields:

        # Input fields list
        if isinstance(explode_infos_fields, str):
            fields_input = explode_infos_fields.split(",")
        elif isinstance(explode_infos_fields, list):
            fields_input = explode_infos_fields
        else:
            fields_input = []

        # Fields list without * keyword
        fields_without_all = fields_input.copy()
        if "*".casefold() in (item.casefold() for item in fields_without_all):
            fields_without_all.remove("*")

        # Fields in header
        fields_in_header = sorted(list(set(self.get_header().infos)))

        # Construct list of fields
        fields_output = []
        for field in fields_input:

            # Strip field
            field = field.strip()

            # format keyword * in regex
            if field.upper() in ["*"]:
                field = ".*"

            # Find all fields with pattern
            r = re.compile(field)
            fields_search = sorted(list(filter(r.match, fields_in_header)))

            # Remove fields input from search
            if fields_search != [field]:
                fields_search = sorted(
                    list(set(fields_search).difference(fields_input))
                )

            # If field is not in header (avoid not well formatted header)
            if not fields_search and not remove_fields_not_in_header:
                fields_search = [field]

            # Add found fields
            for new_field in fields_search:
                # Add field, if not already exists, and if it is in header (if asked)
                if (
                    new_field not in fields_output
                    and (
                        not remove_fields_not_in_header
                        or new_field in fields_in_header
                    )
                    and new_field not in [".*"]
                ):
                    fields_output.append(new_field)

        return fields_output

    else:

        return []


def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
    """
    Return the prefix used for exploded INFO columns, from the argument or
    from the "explode_infos_prefix" parameter (empty string by default).

    :param explode_infos_prefix: prefix to use; if falsy, the parameter
        value is used instead
    :return: the prefix
    """

    if not explode_infos_prefix:
        explode_infos_prefix = (
            self.get_param().get("explode", {}).get("explode_infos_prefix", "")
        )

    return explode_infos_prefix


# NOTE(review): add_column starts here in the original; it is rendered in
# full in the next part of this file.
def add_column(
    self,
    table_name,
    column_name,
    column_type,
    default_value=None,
    drop: bool = False,
) -> dict:
    """
    Add a column to a table if it does not already exist.

    :param table_name: name of the table to alter
    :param column_name: name of the column to add
    :param column_type: SQL type of the new column (e.g. "INTEGER",
        "VARCHAR")
    :param default_value: optional default value for the new column,
        interpolated as-is into the ALTER statement
    :param drop: if True and the column already exists, drop it first and
        re-create it; if False, an existing column is left untouched,
        defaults to False
    :return: a dict describing the added column ("table_name",
        "column_name", "column_type", "default_value"), or None if the
        column was not newly added (it already existed, with or without a
        drop/re-create)
    """

    # added
    added = False
    dropped = False

    # Check if the column already exists in the table
    query = f""" SELECT * FROM {table_name} LIMIT 0 """
    columns = self.get_query_to_df(query).columns.tolist()
    if column_name in columns:
        log.debug(
            f"The {column_name} column already exists in the {table_name} table"
        )
        if drop:
            self.drop_column(table_name=table_name, column_name=column_name)
            dropped = True
        else:
            return None
    else:
        log.debug(f"The {column_name} column NOT exists in the {table_name} table")

    # Add column in table
    add_column_query = (
        f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
    )
    if default_value is not None:
        add_column_query += f" DEFAULT {default_value}"
    self.execute_query(add_column_query)
    added = not dropped
    log.debug(
        f"The {column_name} column was successfully added to the {table_name} table"
    )

    if added:
        added_column = {
            "table_name": table_name,
            "column_name": column_name,
            "column_type": column_type,
            "default_value": default_value,
        }
    else:
        added_column = None

    return added_column


def drop_column(
    self, column: dict = None, table_name: str = None, column_name: str = None
) -> bool:
    """
    Drop a column from a table.

    :param column: either a dict with "table_name" and "column_name" keys,
        or a column name as a string (the variants table is then assumed)
    :param table_name: name of the table to alter (ignored if `column` is
        provided)
    :param column_name: name of the column to drop (ignored if `column` is
        provided)
    :return: True if the column was dropped, False if it does not exist or
        the target is incompletely specified
    """

    # Find column infos
    if column:
        if isinstance(column, dict):
            table_name = column.get("table_name", None)
            column_name = column.get("column_name", None)
        elif isinstance(column, str):
            table_name = self.get_table_variants()
            column_name = column
        else:
            table_name = None
            column_name = None

    # Fix: both table and column are required (was "and", which let a
    # half-specified target fall through to the existence query)
    if not table_name or not column_name:
        return False

    # Check if the column exists in the table
    query = f""" SELECT * FROM {table_name} LIMIT 0 """
    columns = self.get_query_to_df(query).columns.tolist()
    if column_name in columns:
        log.debug(f"The {column_name} column exists in the {table_name} table")
    else:
        log.debug(f"The {column_name} column NOT exists in the {table_name} table")
        return False

    # Drop column from table (e.g. ALTER TABLE integers DROP k)
    drop_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
    self.execute_query(drop_column_query)
    log.debug(
        f"The {column_name} column was successfully dropped from the {table_name} table"
    )

    return True


# NOTE(review): explode_infos starts here in the original; it is rendered
# in full in the next part of this file.
def explode_infos(
    self,
    prefix: str = None,
    create_index: bool = False,
    fields: list = None,
    force: bool = False,
    proccess_all_fields_together: bool = False,
) -> list:
    """
    Explode VCF INFO fields into individual table columns.

    :param prefix: prefix for the exploded columns; falls back to the
        configured prefix, then to "INFO/"
    :param create_index: create indexes on the exploded columns afterwards,
        defaults to False
    :param fields: INFO fields (or patterns) to explode; all header fields
        when empty
    :param force: drop and re-create a column if it already exists,
        defaults to False
    :param proccess_all_fields_together: update all exploded columns in a
        single UPDATE statement instead of one statement per field,
        defaults to False
    :return: the list of added columns (dicts as returned by add_column)
    """

    # drop indexes (they would be invalidated by the updates below)
    self.drop_indexes()

    # connexion format
    connexion_format = self.get_connexion_format()

    # Access
    access = self.get_config().get("access", None)

    # Added columns
    added_columns = []

    # Nothing to do in read-only mode
    if access not in ["RO"]:

        # prefix: fall back to the configured prefix, then to "INFO/"
        if prefix in [None, True] or not isinstance(prefix, str):
            if self.get_explode_infos_prefix() not in [None, True]:
                prefix = self.get_explode_infos_prefix()
            else:
                prefix = "INFO/"

        # table variants
        table_variants = self.get_table_variants(clause="select")

        # extra infos (best effort: empty when unavailable)
        try:
            extra_infos = self.get_extra_infos()
        except:
            extra_infos = []

        # Header infos
        header_infos = self.get_header().infos

        log.debug(
            f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
        )

        sql_info_alter_table_array = []

        # Info fields to check (header fields plus requested fields)
        fields_list = list(header_infos)
        if fields:
            fields_list += fields
        fields_list = set(fields_list)

        # If no fields
        if not fields:
            fields = []

        # Translate fields if patterns
        fields = self.get_explode_infos_fields(explode_infos_fields=fields)

        for info in fields:

            # Name of the exploded column
            info_id_sql = prefix + info

            if (
                info in fields_list
                or prefix + info in fields_list
                or info in extra_infos
            ):

                log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                # Type/number from the header, defaulting to String
                if info in header_infos:
                    info_type = header_infos[info].type
                    info_num = header_infos[info].num
                else:
                    info_type = "String"
                    info_num = 0

                # Multi-valued fields are stored as VARCHAR
                type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                if info_num != 1:
                    type_sql = "VARCHAR"

                # Add field
                added_column = self.add_column(
                    table_name=table_variants,
                    column_name=info_id_sql,
                    column_type=type_sql,
                    default_value="null",
                    drop=force,
                )

                if added_column:
                    added_columns.append(added_column)

                if added_column or force:

                    # add field to index
                    self.index_additionnal_fields.append(info_id_sql)

                    # Update field array: extract "<info>=<value>" from the
                    # raw INFO column ('' and '.' are normalized to NULL)
                    if connexion_format in ["duckdb"]:
                        update_info_field = f"""
                        "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                        """
                    elif connexion_format in ["sqlite"]:
                        update_info_field = f"""
                        "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                        """

                    sql_info_alter_table_array.append(update_info_field)

        if sql_info_alter_table_array:

            # By chromosomes (single pass if the list cannot be fetched)
            try:
                chromosomes_list = list(
                    self.get_query_to_df(
                        f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                    )["#CHROM"]
                )
            except:
                chromosomes_list = [None]

            for chrom in chromosomes_list:
                log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                # Where clause
                where_clause = ""
                if chrom and len(chromosomes_list) > 1:
                    where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                # Update table
                if proccess_all_fields_together:
                    sql_info_alter_table_array_join = ", ".join(
                        sql_info_alter_table_array
                    )
                    if sql_info_alter_table_array_join:
                        sql_info_alter_table = f"""
                        UPDATE {table_variants}
                        SET {sql_info_alter_table_array_join}
                        {where_clause}
                        """
                        log.debug(
                            f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                        )
                        # log.debug(sql_info_alter_table)
                        self.conn.execute(sql_info_alter_table)
                else:
                    sql_info_alter_num = 0
                    for sql_info_alter in sql_info_alter_table_array:
                        sql_info_alter_num += 1
                        sql_info_alter_table = f"""
                        UPDATE {table_variants}
                        SET {sql_info_alter}
                        {where_clause}
                        """
                        log.debug(
                            f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                        )
                        # log.debug(sql_info_alter_table)
                        self.conn.execute(sql_info_alter_table)

    # create indexes
    if create_index:
        self.create_indexes()

    return added_columns


def create_indexes(self) -> None:
    """
    Create indexes on the variants table: a composite index on the variant
    key plus one index per key column and per additional exploded field.

    No-op when indexing is disabled or access is read-only ("RO").
    """

    # Access
    access = self.get_config().get("access", None)

    # get table variants
    table_variants = self.get_table_variants("FROM")

    if self.get_indexing() and access not in ["RO"]:
        # Create index
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
        self.conn.execute(sql_create_table_index)
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
        self.conn.execute(sql_create_table_index)
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
        self.conn.execute(sql_create_table_index)
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
        self.conn.execute(sql_create_table_index)
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
        self.conn.execute(sql_create_table_index)
        # One index per additional (exploded) field
        for field in self.index_additionnal_fields:
            sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
            self.conn.execute(sql_create_table_index)


def drop_indexes(self) -> None:
    """
    Drop all indexes of the variants table (duckdb or sqlite).

    No-op in read-only ("RO") access mode.
    """

    # Access
    access = self.get_config().get("access", None)

    # get table variants
    table_variants = self.get_table_variants("FROM")

    # Get database format
    connexion_format = self.get_connexion_format()

    if access not in ["RO"]:
        # List existing indexes of the table, per database engine
        if connexion_format in ["duckdb"]:
            sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
        elif connexion_format in ["sqlite"]:
            sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

        list_indexes = self.conn.execute(sql_list_indexes)
        index_names = [row[0] for row in list_indexes.fetchall()]
        for index in index_names:
            sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
            self.conn.execute(sql_drop_table_index)


# NOTE(review): read_vcf_header starts here in the original; it is rendered
# in full in the next part of this file.
1906 """ 1907 1908 header_list = [] 1909 for line in f: 1910 header_list.append(line) 1911 if line.startswith("#CHROM"): 1912 break 1913 return header_list 1914 1915 def read_vcf_header_file(self, file: str = None) -> list: 1916 """ 1917 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 1918 uncompressed files. 1919 1920 :param file: The `file` parameter is a string that represents the path to the VCF header file 1921 that you want to read. It is an optional parameter, so if you don't provide a value, it will 1922 default to `None` 1923 :type file: str 1924 :return: The function `read_vcf_header_file` returns a list. 1925 """ 1926 1927 if self.get_input_compressed(input_file=file): 1928 with bgzf.open(file, "rt") as f: 1929 return self.read_vcf_header(f=f) 1930 else: 1931 with open(file, "rt") as f: 1932 return self.read_vcf_header(f=f) 1933 1934 def execute_query(self, query: str): 1935 """ 1936 It takes a query as an argument, executes it, and returns the results 1937 1938 :param query: The query to be executed 1939 :return: The result of the query is being returned. 1940 """ 1941 if query: 1942 return self.conn.execute(query) # .fetchall() 1943 else: 1944 return None 1945 1946 def export_output( 1947 self, 1948 output_file: str | None = None, 1949 output_header: str | None = None, 1950 export_header: bool = True, 1951 query: str | None = None, 1952 parquet_partitions: list | None = None, 1953 chunk_size: int | None = None, 1954 threads: int | None = None, 1955 sort: bool = False, 1956 index: bool = False, 1957 order_by: str | None = None, 1958 ) -> bool: 1959 """ 1960 The `export_output` function exports data from a VCF file to a specified output file in various 1961 formats, including VCF, CSV, TSV, PSV, and Parquet. 1962 1963 :param output_file: The `output_file` parameter is a string that specifies the name of the 1964 output file to be generated by the function. 
This is where the exported data will be saved 1965 :type output_file: str 1966 :param output_header: The `output_header` parameter is a string that specifies the name of the 1967 file where the header of the VCF file will be exported. If this parameter is not provided, the 1968 header will be exported to a file with the same name as the `output_file` parameter, but with 1969 the extension " 1970 :type output_header: str 1971 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1972 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1973 True, the header will be exported to a file. If `export_header` is False, the header will not 1974 be, defaults to True, if output format is not VCF 1975 :type export_header: bool (optional) 1976 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1977 select specific data from the VCF file before exporting it. If provided, only the data that 1978 matches the query will be exported 1979 :type query: str 1980 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 1981 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 1982 organize data in a hierarchical directory structure based on the values of one or more columns. 1983 This can improve query performance when working with large datasets 1984 :type parquet_partitions: list 1985 :param chunk_size: The `chunk_size` parameter specifies the number of 1986 records in batch when exporting data in Parquet format. This parameter is used for 1987 partitioning the Parquet file into multiple files. 1988 :type chunk_size: int 1989 :param threads: The `threads` parameter is an optional parameter that specifies the number of 1990 threads to be used during the export process. It determines the level of parallelism and can 1991 improve the performance of the export operation. 
If not provided, the function will use the 1992 default number of threads 1993 :type threads: int 1994 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 1995 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 1996 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 1997 False 1998 :type sort: bool (optional) 1999 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2000 created on the output file. If `index` is True, an index will be created. If `index` is False, 2001 no index will be created. The default value is False, defaults to False 2002 :type index: bool (optional) 2003 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2004 sorting the output file. This parameter is only applicable when exporting data in VCF format 2005 :type order_by: str 2006 :return: a boolean value. It checks if the output file exists and returns True if it does, or 2007 None if it doesn't. 
2008 """ 2009 2010 # Log 2011 log.info("Exporting...") 2012 2013 # Full path 2014 output_file = full_path(output_file) 2015 output_header = full_path(output_header) 2016 2017 # Config 2018 config = self.get_config() 2019 2020 # Param 2021 param = self.get_param() 2022 2023 # Tmp files to remove 2024 tmp_to_remove = [] 2025 2026 # If no output, get it 2027 if not output_file: 2028 output_file = self.get_output() 2029 2030 # If not threads 2031 if not threads: 2032 threads = self.get_threads() 2033 2034 # Auto header name with extension 2035 if export_header or output_header: 2036 if not output_header: 2037 output_header = f"{output_file}.hdr" 2038 # Export header 2039 self.export_header(output_file=output_file) 2040 2041 # Switch off export header if VCF output 2042 output_file_type = get_file_format(output_file) 2043 if output_file_type in ["vcf"]: 2044 export_header = False 2045 tmp_to_remove.append(output_header) 2046 2047 # Chunk size 2048 if not chunk_size: 2049 chunk_size = config.get("chunk_size", None) 2050 2051 # Parquet partition 2052 if not parquet_partitions: 2053 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2054 if parquet_partitions and isinstance(parquet_partitions, str): 2055 parquet_partitions = parquet_partitions.split(",") 2056 2057 # Order by 2058 if not order_by: 2059 order_by = param.get("export", {}).get("order_by", "") 2060 2061 # Header in output 2062 header_in_output = param.get("export", {}).get("include_header", False) 2063 2064 # Database 2065 database_source = self.get_connexion() 2066 2067 # Connexion format 2068 connexion_format = self.get_connexion_format() 2069 2070 # Explode infos 2071 if self.get_explode_infos(): 2072 self.explode_infos( 2073 prefix=self.get_explode_infos_prefix(), 2074 fields=self.get_explode_infos_fields(), 2075 force=False, 2076 ) 2077 2078 # if connexion_format in ["sqlite"] or query: 2079 if connexion_format in ["sqlite"]: 2080 2081 # Export in Parquet 2082 random_tmp = 
"".join( 2083 random.choice(string.ascii_lowercase) for i in range(10) 2084 ) 2085 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2086 tmp_to_remove.append(database_source) 2087 2088 # Table Variants 2089 table_variants = self.get_table_variants() 2090 2091 # Create export query 2092 sql_query_export_subquery = f""" 2093 SELECT * FROM {table_variants} 2094 """ 2095 2096 # Write source file 2097 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2098 2099 # Create database 2100 database = Database( 2101 database=database_source, 2102 table="variants", 2103 header_file=output_header, 2104 conn_config=self.get_connexion_config(), 2105 ) 2106 2107 # Existing colomns header 2108 # existing_columns_header = database.get_header_file_columns(output_header) 2109 existing_columns_header = database.get_header_columns_from_database() 2110 2111 # Export file 2112 database.export( 2113 output_database=output_file, 2114 output_header=output_header, 2115 existing_columns_header=existing_columns_header, 2116 parquet_partitions=parquet_partitions, 2117 chunk_size=chunk_size, 2118 threads=threads, 2119 sort=sort, 2120 index=index, 2121 header_in_output=header_in_output, 2122 order_by=order_by, 2123 query=query, 2124 export_header=export_header, 2125 ) 2126 2127 # Remove 2128 remove_if_exists(tmp_to_remove) 2129 2130 return (os.path.exists(output_file) or None) and ( 2131 os.path.exists(output_file) or None 2132 ) 2133 2134 def get_extra_infos(self, table: str = None) -> list: 2135 """ 2136 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2137 in the header. 2138 2139 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2140 name of the table from which you want to retrieve the extra columns that are not present in the 2141 header. 
If the `table` parameter is not provided when calling the function, it will default to 2142 using the variants 2143 :type table: str 2144 :return: A list of columns that are in the specified table but not in the header of the table. 2145 """ 2146 2147 header_columns = [] 2148 2149 if not table: 2150 table = self.get_table_variants(clause="from") 2151 header_columns = self.get_header_columns() 2152 2153 # Check all columns in the database 2154 query = f""" SELECT * FROM {table} LIMIT 1 """ 2155 log.debug(f"query {query}") 2156 table_columns = self.get_query_to_df(query).columns.tolist() 2157 extra_columns = [] 2158 2159 # Construct extra infos (not in header) 2160 for column in table_columns: 2161 if column not in header_columns: 2162 extra_columns.append(column) 2163 2164 return extra_columns 2165 2166 def get_extra_infos_sql(self, table: str = None) -> str: 2167 """ 2168 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2169 by double quotes 2170 2171 :param table: The name of the table to get the extra infos from. If None, the default table is 2172 used 2173 :type table: str 2174 :return: A string of the extra infos 2175 """ 2176 2177 return ", ".join( 2178 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2179 ) 2180 2181 def export_header( 2182 self, 2183 header_name: str = None, 2184 output_file: str = None, 2185 output_file_ext: str = ".hdr", 2186 clean_header: bool = True, 2187 remove_chrom_line: bool = False, 2188 ) -> str: 2189 """ 2190 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2191 specified options, and writes it to a new file. 2192 2193 :param header_name: The `header_name` parameter is the name of the header file to be created. 
If 2194 this parameter is not specified, the header will be written to the output file 2195 :type header_name: str 2196 :param output_file: The `output_file` parameter in the `export_header` function is used to 2197 specify the name of the output file where the header will be written. If this parameter is not 2198 provided, the header will be written to a temporary file 2199 :type output_file: str 2200 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2201 string that represents the extension of the output header file. By default, it is set to ".hdr" 2202 if not specified by the user. This extension will be appended to the `output_file` name to 2203 create the final, defaults to .hdr 2204 :type output_file_ext: str (optional) 2205 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2206 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2207 `True`, the function will clean the header by modifying certain lines based on a specific 2208 pattern. If `clean_header`, defaults to True 2209 :type clean_header: bool (optional) 2210 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2211 boolean flag that determines whether the #CHROM line should be removed from the header before 2212 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2213 defaults to False 2214 :type remove_chrom_line: bool (optional) 2215 :return: The function `export_header` returns the name of the temporary header file that is 2216 created. 
2217 """ 2218 2219 if not header_name and not output_file: 2220 output_file = self.get_output() 2221 2222 if self.get_header(): 2223 2224 # Get header object 2225 header_obj = self.get_header() 2226 2227 # Create database 2228 db_for_header = Database(database=self.get_input()) 2229 2230 # Get real columns in the file 2231 db_header_columns = db_for_header.get_columns() 2232 2233 with tempfile.TemporaryDirectory() as tmpdir: 2234 2235 # Write header file 2236 header_file_tmp = os.path.join(tmpdir, "header") 2237 f = open(header_file_tmp, "w") 2238 vcf.Writer(f, header_obj) 2239 f.close() 2240 2241 # Replace #CHROM line with rel columns 2242 header_list = db_for_header.read_header_file( 2243 header_file=header_file_tmp 2244 ) 2245 header_list[-1] = "\t".join(db_header_columns) 2246 2247 # Remove CHROM line 2248 if remove_chrom_line: 2249 header_list.pop() 2250 2251 # Clean header 2252 if clean_header: 2253 header_list_clean = [] 2254 for head in header_list: 2255 # Clean head for malformed header 2256 head_clean = head 2257 head_clean = re.subn( 2258 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2259 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2260 head_clean, 2261 2, 2262 )[0] 2263 # Write header 2264 header_list_clean.append(head_clean) 2265 header_list = header_list_clean 2266 2267 tmp_header_name = output_file + output_file_ext 2268 2269 f = open(tmp_header_name, "w") 2270 for line in header_list: 2271 f.write(line) 2272 f.close() 2273 2274 return tmp_header_name 2275 2276 def export_variant_vcf( 2277 self, 2278 vcf_file, 2279 remove_info: bool = False, 2280 add_samples: bool = True, 2281 list_samples: list = [], 2282 where_clause: str = "", 2283 index: bool = False, 2284 threads: int | None = None, 2285 ) -> bool | None: 2286 """ 2287 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2288 remove INFO field, add samples, and control compression and indexing. 
def run_commands(self, commands: list = None, threads: int = 1) -> None:
    """
    Run a list of shell commands in parallel.

    :param commands: the commands to run; defaults to no commands
        (fix: avoid the mutable default argument `commands=[]`)
    :param threads: the number of parallel threads, defaults to 1 (optional)
    """

    if commands is None:
        commands = []

    run_parallel_commands(commands, threads)

def get_threads(self, default: int = 1) -> int:
    """
    Return the number of threads to use, from param or config.

    A missing/zero value falls back to `default`; a negative value means
    "use every available core".

    :param default: number of threads used when no setting is found,
        defaults to 1
    :type default: int (optional)
    :return: the number of threads to use for the current job
    """

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Param takes precedence over config
    input_thread = param.get("threads", config.get("threads", None))

    # Check threads
    if not input_thread:
        threads = default
    elif int(input_thread) <= 0:
        # Non-positive value: use every available CPU
        threads = os.cpu_count()
    else:
        threads = int(input_thread)
    return threads

def get_memory(self, default: str = None) -> str:
    """
    Return the memory setting (e.g. "8G"), from param or config.

    :param default: fallback value used when no memory setting is found in
        either the param or the config dictionary
    :type default: str
    :return: the configured memory value, or `default`
    """

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Param takes precedence over config
    input_memory = param.get("memory", config.get("memory", None))

    # Fallback to default
    if input_memory:
        memory = input_memory
    else:
        memory = default

    return memory
def update_from_vcf(self, vcf_file: str) -> None:
    """
    Merge a VCF file's INFO annotations into the variants table, using the
    implementation matching the connexion format (duckdb or sqlite).

    :param vcf_file: the path to the VCF file
    """

    connexion_format = self.get_connexion_format()

    if connexion_format in ["duckdb"]:
        self.update_from_vcf_duckdb(vcf_file)
    elif connexion_format in ["sqlite"]:
        self.update_from_vcf_sqlite(vcf_file)
    else:
        # Previously a silent no-op; at least leave a trace in the log
        log.warning(
            f"Connexion format '{connexion_format}' not supported for update from VCF"
        )

def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    """
    Append the VCF file's INFO values to the matching variants (same
    #CHROM/POS/REF/ALT) of the duckdb variants table.

    :param vcf_file: the path to the VCF file
    """

    # Variants table
    table_variants = self.get_table_variants()

    # Load the VCF into a DataFrame, skipping the header lines
    skip = self.get_header_length(file=vcf_file)
    vcf_df = pd.read_csv(
        vcf_file,
        sep="\t",
        engine="c",
        skiprows=skip,
        header=0,
        low_memory=False,
    )
    # NOTE: `vcf_df` is referenced by name in the SQL below through
    # duckdb's pandas replacement scan -- it is not unused.
    sql_query_update = f"""
    UPDATE {table_variants} as table_variants
    SET INFO = concat(
        CASE
            WHEN INFO NOT IN ('', '.')
            THEN INFO
            ELSE ''
        END,
        (
        SELECT
            concat(
                CASE
                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                    THEN ';'
                    ELSE ''
                END
                ,
                CASE
                    WHEN table_parquet.INFO NOT IN ('','.')
                    THEN table_parquet.INFO
                    ELSE ''
                END
            )
        FROM vcf_df as table_parquet
        WHERE CAST(table_parquet."#CHROM" AS VARCHAR) = CAST(table_variants."#CHROM" AS VARCHAR)
            AND table_parquet."POS" = table_variants."POS"
            AND table_parquet."ALT" = table_variants."ALT"
            AND table_parquet."REF" = table_variants."REF"
            AND table_parquet.INFO NOT IN ('','.')
        )
    )
    ;
    """
    self.conn.execute(sql_query_update)

def update_from_vcf_sqlite(self, vcf_file: str) -> None:
    """
    Append the VCF file's INFO values to the matching variants (same
    #CHROM/POS/REF/ALT) of the sqlite variants table, through a temporary
    table.

    NOTE(review): the loader assigns exactly 8 column names, so a VCF with
    FORMAT/sample columns would fail here -- confirm callers only pass
    annotation VCFs without samples.

    :param vcf_file: the path to the VCF file
    """

    # Create a temporary table with the variants schema
    table_vcf = "tmp_vcf"
    sql_create = (
        f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
    )
    self.conn.execute(sql_create)

    # Load the VCF into the temporary table ('#' header lines are skipped)
    vcf_df = pd.read_csv(
        vcf_file, sep="\t", comment="#", header=None, low_memory=False
    )
    vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

    # Update table 'variants' with VCF data
    # warning: CONCAT as || operator
    # Fix: in SQLite, `x || NULL` is NULL, so a variant without a match in
    # the VCF previously had its INFO nulled out; COALESCE the correlated
    # subquery to '' (duckdb's concat() is NULL-safe, this aligns behavior)
    sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO =  CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    COALESCE((
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf."#CHROM" = table_variants."#CHROM"
                        AND table_vcf."POS" = table_variants."POS"
                        AND table_vcf."ALT" = table_variants."ALT"
                        AND table_vcf."REF" = table_variants."REF"
                    ), '')
    """
    self.conn.execute(sql_query_update)

    # Drop temporary table
    sql_drop = f"DROP TABLE {table_vcf}"
    self.conn.execute(sql_drop)

def drop_variants_table(self) -> None:
    """
    Drop the variants table if it exists.
    """

    table_variants = self.get_table_variants()
    sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
    self.conn.execute(sql_table_variants)

def set_variant_id(
    self, variant_id_column: str = "variant_id", force: bool = None
) -> str:
    """
    Add (if needed) and populate a `variant_id` column holding a hash of
    assembly, #CHROM, POS, REF, ALT and the exploded SVTYPE.

    :param variant_id_column: name of the column to create, defaults to
        "variant_id"
    :type variant_id_column: str (optional)
    :param force: recompute the column even if it already exists
    :type force: bool
    :return: the name of the variant_id column
    """

    # Assembly
    assembly = self.get_param().get(
        "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
    )

    # INFO/Tag prefix
    prefix = self.get_explode_infos_prefix()

    # Explode INFO/SVTYPE so it can participate in the hash
    added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

    # Variants table
    table_variants = self.get_table_variants()

    # variant_id column
    if not variant_id_column:
        variant_id_column = "variant_id"

    # Create variant_id column
    if "variant_id" not in self.get_extra_infos() or force:

        # Create column
        self.add_column(
            table_name=table_variants,
            column_name=variant_id_column,
            column_type="UBIGINT",
            default_value="0",
        )

        # Update column
        # Fix: the SVTYPE term was previously the SQL *string literal*
        # '"<prefix>SVTYPE"' rather than the exploded column, so the hash
        # never depended on SVTYPE; reference the column identifier instead.
        self.conn.execute(
            f"""
            UPDATE {table_variants}
            SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", "{prefix}SVTYPE")
            """
        )

    # Remove the exploded helper columns
    for added_column in added_columns:
        self.drop_column(column=added_column)

    # return variant_id column name
    return variant_id_column
def get_variant_id_column(
    self, variant_id_column: str = "variant_id", force: bool = None
) -> str:
    """
    Return the name of the variant_id column, creating and populating it
    when needed (delegates to `set_variant_id`).

    :param variant_id_column: name of the variant_id column, defaults to
        "variant_id"
    :type variant_id_column: str (optional)
    :param force: recompute the column even if it already exists
    :type force: bool
    :return: the variant_id column name
    """

    return self.set_variant_id(variant_id_column=variant_id_column, force=force)

###
# Annotation
###

def scan_databases(
    self,
    database_formats: list = None,
    database_releases: list = None,
) -> dict:
    """
    Scan for available annotation databases matching the given formats and
    releases for the configured assembly.

    :param database_formats: formats to scan for, defaults to ["parquet"]
        (fix: avoid mutable default list arguments; behavior unchanged)
    :type database_formats: list
    :param database_releases: releases to scan for, defaults to ["current"]
    :type database_releases: list
    :return: a dictionary describing the databases found
    """

    if database_formats is None:
        database_formats = ["parquet"]
    if database_releases is None:
        database_releases = ["current"]

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Param - Assembly
    assembly = param.get("assembly", config.get("assembly", None))
    if not assembly:
        assembly = DEFAULT_ASSEMBLY
        log.warning(f"Default assembly '{assembly}'")

    # Scan for available databases
    log.info(
        f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
    )
    databases_infos_dict = databases_infos(
        database_folder_releases=database_releases,
        database_formats=database_formats,
        assembly=assembly,
        config=config,
    )
    log.info(
        f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
    )

    return databases_infos_dict
In the provided function, the default value for 2681 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2682 databases that are in the "current" 2683 :type database_releases: list 2684 :return: The function `scan_databases` returns a dictionary containing information about 2685 databases that match the specified formats and releases. 2686 """ 2687 2688 # Config 2689 config = self.get_config() 2690 2691 # Param 2692 param = self.get_param() 2693 2694 # Param - Assembly 2695 assembly = param.get("assembly", config.get("assembly", None)) 2696 if not assembly: 2697 assembly = DEFAULT_ASSEMBLY 2698 log.warning(f"Default assembly '{assembly}'") 2699 2700 # Scan for availabled databases 2701 log.info( 2702 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2703 ) 2704 databases_infos_dict = databases_infos( 2705 database_folder_releases=database_releases, 2706 database_formats=database_formats, 2707 assembly=assembly, 2708 config=config, 2709 ) 2710 log.info( 2711 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2712 ) 2713 2714 return databases_infos_dict 2715 2716 def annotation(self) -> None: 2717 """ 2718 It annotates the VCF file with the annotations specified in the config file. 
2719 """ 2720 2721 # Config 2722 config = self.get_config() 2723 2724 # Param 2725 param = self.get_param() 2726 2727 # Param - Assembly 2728 assembly = param.get("assembly", config.get("assembly", None)) 2729 if not assembly: 2730 assembly = DEFAULT_ASSEMBLY 2731 log.warning(f"Default assembly '{assembly}'") 2732 2733 # annotations databases folders 2734 annotations_databases = set( 2735 config.get("folders", {}) 2736 .get("databases", {}) 2737 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2738 + config.get("folders", {}) 2739 .get("databases", {}) 2740 .get("parquet", ["~/howard/databases/parquet/current"]) 2741 + config.get("folders", {}) 2742 .get("databases", {}) 2743 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2744 ) 2745 2746 # Get param annotations 2747 if param.get("annotations", None) and isinstance( 2748 param.get("annotations", None), str 2749 ): 2750 log.debug(param.get("annotations", None)) 2751 param_annotation_list = param.get("annotations").split(",") 2752 else: 2753 param_annotation_list = [] 2754 2755 # Each tools param 2756 if param.get("annotation_parquet", None) != None: 2757 log.debug( 2758 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2759 ) 2760 if isinstance(param.get("annotation_parquet", None), list): 2761 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2762 else: 2763 param_annotation_list.append(param.get("annotation_parquet")) 2764 if param.get("annotation_snpsift", None) != None: 2765 if isinstance(param.get("annotation_snpsift", None), list): 2766 param_annotation_list.append( 2767 "snpsift:" 2768 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2769 ) 2770 else: 2771 param_annotation_list.append( 2772 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2773 ) 2774 if param.get("annotation_snpeff", None) != None: 2775 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2776 if param.get("annotation_bcftools", 
None) != None: 2777 if isinstance(param.get("annotation_bcftools", None), list): 2778 param_annotation_list.append( 2779 "bcftools:" 2780 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2781 ) 2782 else: 2783 param_annotation_list.append( 2784 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2785 ) 2786 if param.get("annotation_annovar", None) != None: 2787 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2788 if param.get("annotation_exomiser", None) != None: 2789 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2790 if param.get("annotation_splice", None) != None: 2791 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2792 2793 # Merge param annotations list 2794 param["annotations"] = ",".join(param_annotation_list) 2795 2796 # debug 2797 log.debug(f"param_annotations={param['annotations']}") 2798 2799 if param.get("annotations"): 2800 2801 # Log 2802 # log.info("Annotations - Check annotation parameters") 2803 2804 if not "annotation" in param: 2805 param["annotation"] = {} 2806 2807 # List of annotations parameters 2808 annotations_list_input = {} 2809 if isinstance(param.get("annotations", None), str): 2810 annotation_file_list = [ 2811 value for value in param.get("annotations", "").split(",") 2812 ] 2813 for annotation_file in annotation_file_list: 2814 annotations_list_input[annotation_file] = {"INFO": None} 2815 else: 2816 annotations_list_input = param.get("annotations", {}) 2817 2818 log.info(f"Quick Annotations:") 2819 for annotation_key in list(annotations_list_input.keys()): 2820 log.info(f" {annotation_key}") 2821 2822 # List of annotations and associated fields 2823 annotations_list = {} 2824 2825 for annotation_file in annotations_list_input: 2826 2827 # Explode annotations if ALL 2828 if ( 2829 annotation_file.upper() == "ALL" 2830 or annotation_file.upper().startswith("ALL:") 2831 ): 2832 2833 # check ALL parameters (formats, releases) 
2834 annotation_file_split = annotation_file.split(":") 2835 database_formats = "parquet" 2836 database_releases = "current" 2837 for annotation_file_option in annotation_file_split[1:]: 2838 database_all_options_split = annotation_file_option.split("=") 2839 if database_all_options_split[0] == "format": 2840 database_formats = database_all_options_split[1].split("+") 2841 if database_all_options_split[0] == "release": 2842 database_releases = database_all_options_split[1].split("+") 2843 2844 # Scan for availabled databases 2845 databases_infos_dict = self.scan_databases( 2846 database_formats=database_formats, 2847 database_releases=database_releases, 2848 ) 2849 2850 # Add found databases in annotation parameters 2851 for database_infos in databases_infos_dict.keys(): 2852 annotations_list[database_infos] = {"INFO": None} 2853 2854 else: 2855 annotations_list[annotation_file] = annotations_list_input[ 2856 annotation_file 2857 ] 2858 2859 # Check each databases 2860 if len(annotations_list): 2861 2862 log.info( 2863 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
2864 ) 2865 2866 for annotation_file in annotations_list: 2867 2868 # Init 2869 annotations = annotations_list.get(annotation_file, None) 2870 2871 # Annotation snpEff 2872 if annotation_file.startswith("snpeff"): 2873 2874 log.debug(f"Quick Annotation snpEff") 2875 2876 if "snpeff" not in param["annotation"]: 2877 param["annotation"]["snpeff"] = {} 2878 2879 if "options" not in param["annotation"]["snpeff"]: 2880 param["annotation"]["snpeff"]["options"] = "" 2881 2882 # snpEff options in annotations 2883 param["annotation"]["snpeff"]["options"] = "".join( 2884 annotation_file.split(":")[1:] 2885 ) 2886 2887 # Annotation Annovar 2888 elif annotation_file.startswith("annovar"): 2889 2890 log.debug(f"Quick Annotation Annovar") 2891 2892 if "annovar" not in param["annotation"]: 2893 param["annotation"]["annovar"] = {} 2894 2895 if "annotations" not in param["annotation"]["annovar"]: 2896 param["annotation"]["annovar"]["annotations"] = {} 2897 2898 # Options 2899 annotation_file_split = annotation_file.split(":") 2900 for annotation_file_annotation in annotation_file_split[1:]: 2901 if annotation_file_annotation: 2902 param["annotation"]["annovar"]["annotations"][ 2903 annotation_file_annotation 2904 ] = annotations 2905 2906 # Annotation Exomiser 2907 elif annotation_file.startswith("exomiser"): 2908 2909 log.debug(f"Quick Annotation Exomiser") 2910 2911 param["annotation"]["exomiser"] = params_string_to_dict( 2912 annotation_file 2913 ) 2914 2915 # Annotation Splice 2916 elif annotation_file.startswith("splice"): 2917 2918 log.debug(f"Quick Annotation Splice") 2919 2920 param["annotation"]["splice"] = params_string_to_dict( 2921 annotation_file 2922 ) 2923 2924 # Annotation Parquet or BCFTOOLS 2925 else: 2926 2927 # Tools detection 2928 if annotation_file.startswith("bcftools:"): 2929 annotation_tool_initial = "bcftools" 2930 annotation_file = ":".join(annotation_file.split(":")[1:]) 2931 elif annotation_file.startswith("snpsift:"): 2932 annotation_tool_initial = 
"snpsift" 2933 annotation_file = ":".join(annotation_file.split(":")[1:]) 2934 else: 2935 annotation_tool_initial = None 2936 2937 # list of files 2938 annotation_file_list = annotation_file.replace("+", ":").split( 2939 ":" 2940 ) 2941 2942 for annotation_file in annotation_file_list: 2943 2944 if annotation_file: 2945 2946 # Annotation tool initial 2947 annotation_tool = annotation_tool_initial 2948 2949 # Find file 2950 annotation_file_found = None 2951 2952 # Expand user 2953 annotation_file = full_path(annotation_file) 2954 2955 if os.path.exists(annotation_file): 2956 annotation_file_found = annotation_file 2957 2958 else: 2959 # Find within assembly folders 2960 for annotations_database in annotations_databases: 2961 found_files = find_all( 2962 annotation_file, 2963 os.path.join( 2964 annotations_database, assembly 2965 ), 2966 ) 2967 if len(found_files) > 0: 2968 annotation_file_found = found_files[0] 2969 break 2970 if not annotation_file_found and not assembly: 2971 # Find within folders 2972 for ( 2973 annotations_database 2974 ) in annotations_databases: 2975 found_files = find_all( 2976 annotation_file, annotations_database 2977 ) 2978 if len(found_files) > 0: 2979 annotation_file_found = found_files[0] 2980 break 2981 log.debug( 2982 f"for {annotation_file} annotation_file_found={annotation_file_found}" 2983 ) 2984 2985 # Full path 2986 annotation_file_found = full_path(annotation_file_found) 2987 2988 if annotation_file_found: 2989 2990 database = Database(database=annotation_file_found) 2991 quick_annotation_format = database.get_format() 2992 quick_annotation_is_compressed = ( 2993 database.is_compressed() 2994 ) 2995 quick_annotation_is_indexed = os.path.exists( 2996 f"{annotation_file_found}.tbi" 2997 ) 2998 bcftools_preference = False 2999 3000 # Check Annotation Tool 3001 if not annotation_tool: 3002 if ( 3003 bcftools_preference 3004 and quick_annotation_format 3005 in ["vcf", "bed"] 3006 and quick_annotation_is_compressed 3007 and 
quick_annotation_is_indexed 3008 ): 3009 annotation_tool = "bcftools" 3010 elif quick_annotation_format in [ 3011 "vcf", 3012 "bed", 3013 "tsv", 3014 "tsv", 3015 "csv", 3016 "json", 3017 "tbl", 3018 "parquet", 3019 "duckdb", 3020 ]: 3021 annotation_tool = "parquet" 3022 else: 3023 log.error( 3024 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3025 ) 3026 raise ValueError( 3027 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3028 ) 3029 3030 log.debug( 3031 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3032 ) 3033 3034 # Annotation Tool dispatch 3035 if annotation_tool: 3036 if annotation_tool not in param["annotation"]: 3037 param["annotation"][annotation_tool] = {} 3038 if ( 3039 "annotations" 3040 not in param["annotation"][annotation_tool] 3041 ): 3042 param["annotation"][annotation_tool][ 3043 "annotations" 3044 ] = {} 3045 param["annotation"][annotation_tool][ 3046 "annotations" 3047 ][annotation_file_found] = annotations 3048 3049 else: 3050 log.error( 3051 f"Quick Annotation File {annotation_file} does NOT exist" 3052 ) 3053 3054 self.set_param(param) 3055 3056 if param.get("annotation", None): 3057 log.info("Annotations") 3058 if param.get("annotation", {}).get("parquet", None): 3059 log.info("Annotations 'parquet'...") 3060 self.annotation_parquet() 3061 if param.get("annotation", {}).get("bcftools", None): 3062 log.info("Annotations 'bcftools'...") 3063 self.annotation_bcftools() 3064 if param.get("annotation", {}).get("snpsift", None): 3065 log.info("Annotations 'snpsift'...") 3066 self.annotation_snpsift() 3067 if param.get("annotation", {}).get("annovar", None): 3068 log.info("Annotations 'annovar'...") 3069 self.annotation_annovar() 3070 if param.get("annotation", {}).get("snpeff", None): 3071 log.info("Annotations 'snpeff'...") 3072 self.annotation_snpeff() 3073 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3074 log.info("Annotations 'exomiser'...") 3075 self.annotation_exomiser() 3076 if param.get("annotation", {}).get("splice", None) is not None: 3077 log.info("Annotations 'splice' ...") 3078 self.annotation_splice() 3079 3080 # Explode INFOS fields into table fields 3081 if self.get_explode_infos(): 3082 self.explode_infos( 3083 prefix=self.get_explode_infos_prefix(), 3084 fields=self.get_explode_infos_fields(), 3085 force=True, 3086 ) 3087 3088 def annotation_snpsift(self, threads: int = None) -> None: 3089 """ 3090 This function annotate with bcftools 3091 3092 :param threads: Number of threads to use 3093 :return: the value of the variable "return_value". 3094 """ 3095 3096 # DEBUG 3097 log.debug("Start annotation with bcftools databases") 3098 3099 # Threads 3100 if not threads: 3101 threads = self.get_threads() 3102 log.debug("Threads: " + str(threads)) 3103 3104 # Config 3105 config = self.get_config() 3106 log.debug("Config: " + str(config)) 3107 3108 # Config - snpSift 3109 snpsift_bin_command = get_bin_command( 3110 bin="SnpSift.jar", 3111 tool="snpsift", 3112 bin_type="jar", 3113 config=config, 3114 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3115 ) 3116 if not snpsift_bin_command: 3117 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3118 log.error(msg_err) 3119 raise ValueError(msg_err) 3120 3121 # Config - bcftools 3122 bcftools_bin_command = get_bin_command( 3123 bin="bcftools", 3124 tool="bcftools", 3125 bin_type="bin", 3126 config=config, 3127 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3128 ) 3129 if not bcftools_bin_command: 3130 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3131 log.error(msg_err) 3132 raise ValueError(msg_err) 3133 3134 # Config - BCFTools databases folders 3135 databases_folders = set( 3136 self.get_config() 3137 .get("folders", {}) 3138 .get("databases", {}) 3139 .get("annotations", ["."]) 3140 + self.get_config() 3141 .get("folders", {}) 3142 
.get("databases", {}) 3143 .get("bcftools", ["."]) 3144 ) 3145 log.debug("Databases annotations: " + str(databases_folders)) 3146 3147 # Param 3148 annotations = ( 3149 self.get_param() 3150 .get("annotation", {}) 3151 .get("snpsift", {}) 3152 .get("annotations", None) 3153 ) 3154 log.debug("Annotations: " + str(annotations)) 3155 3156 # Assembly 3157 assembly = self.get_param().get( 3158 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3159 ) 3160 3161 # Data 3162 table_variants = self.get_table_variants() 3163 3164 # Check if not empty 3165 log.debug("Check if not empty") 3166 sql_query_chromosomes = ( 3167 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3168 ) 3169 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3170 if not sql_query_chromosomes_df["count"][0]: 3171 log.info(f"VCF empty") 3172 return 3173 3174 # VCF header 3175 vcf_reader = self.get_header() 3176 log.debug("Initial header: " + str(vcf_reader.infos)) 3177 3178 # Existing annotations 3179 for vcf_annotation in self.get_header().infos: 3180 3181 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3182 log.debug( 3183 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3184 ) 3185 3186 if annotations: 3187 3188 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3189 3190 # Export VCF file 3191 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3192 3193 # Init 3194 commands = {} 3195 3196 for annotation in annotations: 3197 annotation_fields = annotations[annotation] 3198 3199 # Annotation Name 3200 annotation_name = os.path.basename(annotation) 3201 3202 if not annotation_fields: 3203 annotation_fields = {"INFO": None} 3204 3205 log.debug(f"Annotation '{annotation_name}'") 3206 log.debug( 3207 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3208 ) 3209 3210 # Create Database 3211 database = Database( 3212 database=annotation, 3213 databases_folders=databases_folders, 3214 
assembly=assembly, 3215 ) 3216 3217 # Find files 3218 db_file = database.get_database() 3219 db_file = full_path(db_file) 3220 db_hdr_file = database.get_header_file() 3221 db_hdr_file = full_path(db_hdr_file) 3222 db_file_type = database.get_format() 3223 db_tbi_file = f"{db_file}.tbi" 3224 db_file_compressed = database.is_compressed() 3225 3226 # Check if compressed 3227 if not db_file_compressed: 3228 log.error( 3229 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3230 ) 3231 raise ValueError( 3232 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3233 ) 3234 3235 # Check if indexed 3236 if not os.path.exists(db_tbi_file): 3237 log.error( 3238 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3239 ) 3240 raise ValueError( 3241 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3242 ) 3243 3244 # Check index - try to create if not exists 3245 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3246 log.error("Annotation failed: database not valid") 3247 log.error(f"Annotation annotation file: {db_file}") 3248 log.error(f"Annotation annotation header: {db_hdr_file}") 3249 log.error(f"Annotation annotation index: {db_tbi_file}") 3250 raise ValueError( 3251 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3252 ) 3253 else: 3254 3255 log.debug( 3256 f"Annotation '{annotation}' - file: " 3257 + str(db_file) 3258 + " and " 3259 + str(db_hdr_file) 3260 ) 3261 3262 # Load header as VCF object 3263 db_hdr_vcf = Variants(input=db_hdr_file) 3264 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3265 log.debug( 3266 "Annotation database header: " 3267 + str(db_hdr_vcf_header_infos) 3268 ) 3269 3270 # For all fields in database 3271 annotation_fields_full = False 3272 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3273 annotation_fields = { 3274 key: key for key in 
db_hdr_vcf_header_infos 3275 } 3276 log.debug( 3277 "Annotation database header - All annotations added: " 3278 + str(annotation_fields) 3279 ) 3280 annotation_fields_full = True 3281 3282 # # Create file for field rename 3283 # log.debug("Create file for field rename") 3284 # tmp_rename = NamedTemporaryFile( 3285 # prefix=self.get_prefix(), 3286 # dir=self.get_tmp_dir(), 3287 # suffix=".rename", 3288 # delete=False, 3289 # ) 3290 # tmp_rename_name = tmp_rename.name 3291 # tmp_files.append(tmp_rename_name) 3292 3293 # Number of fields 3294 nb_annotation_field = 0 3295 annotation_list = [] 3296 annotation_infos_rename_list = [] 3297 3298 for annotation_field in annotation_fields: 3299 3300 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3301 annotation_fields_new_name = annotation_fields.get( 3302 annotation_field, annotation_field 3303 ) 3304 if not annotation_fields_new_name: 3305 annotation_fields_new_name = annotation_field 3306 3307 # Check if field is in DB and if field is not elready in input data 3308 if ( 3309 annotation_field in db_hdr_vcf.get_header().infos 3310 and annotation_fields_new_name 3311 not in self.get_header().infos 3312 ): 3313 3314 log.info( 3315 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3316 ) 3317 3318 # BCFTools annotate param to rename fields 3319 if annotation_field != annotation_fields_new_name: 3320 annotation_infos_rename_list.append( 3321 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3322 ) 3323 3324 # Add INFO field to header 3325 db_hdr_vcf_header_infos_number = ( 3326 db_hdr_vcf_header_infos[annotation_field].num or "." 
3327 ) 3328 db_hdr_vcf_header_infos_type = ( 3329 db_hdr_vcf_header_infos[annotation_field].type 3330 or "String" 3331 ) 3332 db_hdr_vcf_header_infos_description = ( 3333 db_hdr_vcf_header_infos[annotation_field].desc 3334 or f"{annotation_field} description" 3335 ) 3336 db_hdr_vcf_header_infos_source = ( 3337 db_hdr_vcf_header_infos[annotation_field].source 3338 or "unknown" 3339 ) 3340 db_hdr_vcf_header_infos_version = ( 3341 db_hdr_vcf_header_infos[annotation_field].version 3342 or "unknown" 3343 ) 3344 3345 vcf_reader.infos[annotation_fields_new_name] = ( 3346 vcf.parser._Info( 3347 annotation_fields_new_name, 3348 db_hdr_vcf_header_infos_number, 3349 db_hdr_vcf_header_infos_type, 3350 db_hdr_vcf_header_infos_description, 3351 db_hdr_vcf_header_infos_source, 3352 db_hdr_vcf_header_infos_version, 3353 self.code_type_map[ 3354 db_hdr_vcf_header_infos_type 3355 ], 3356 ) 3357 ) 3358 3359 annotation_list.append(annotation_field) 3360 3361 nb_annotation_field += 1 3362 3363 else: 3364 3365 if ( 3366 annotation_field 3367 not in db_hdr_vcf.get_header().infos 3368 ): 3369 log.warning( 3370 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3371 ) 3372 if ( 3373 annotation_fields_new_name 3374 in self.get_header().infos 3375 ): 3376 log.warning( 3377 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3378 ) 3379 3380 log.info( 3381 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3382 ) 3383 3384 annotation_infos = ",".join(annotation_list) 3385 3386 if annotation_infos != "": 3387 3388 # Annotated VCF (and error file) 3389 tmp_annotation_vcf_name = os.path.join( 3390 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3391 ) 3392 tmp_annotation_vcf_name_err = ( 3393 tmp_annotation_vcf_name + ".err" 3394 ) 3395 3396 # Add fields to annotate 3397 if not annotation_fields_full: 3398 annotation_infos_option = f"-info {annotation_infos}" 3399 else: 
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate variants with `bcftools annotate`, using VCF/BED databases.

        Each configured database (param "annotation" -> "bcftools" -> "annotations")
        must be bgzip-compressed and tabix-indexed, with a readable header file.
        The current variants are exported to a temporary VCF, annotated per
        chromosome (restricted to merged variant regions via a BED file), the
        per-chromosome outputs are merged with `bcftools merge`, and the result
        is folded back into the variants table with update_from_vcf().

        :param threads: Number of threads to use (defaults to self.get_threads())
        :return: None
        :raises ValueError: if the bcftools binary is not found, a database is
            missing/not compressed/not indexed, or an annotation command reported
            errors ("[E::" lines in the stderr files)
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG - keep temporary files around when verbosity is "debug"
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders (generic annotations + bcftools-specific)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # NOTE(review): created with delete=False and never added to tmp_files,
        # so this file is only removed via the merge command's cleanup — confirm
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "all INFO fields"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools -a with --regions-file requires bgzip)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (tabix .tbi required)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database and header files exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database ("ALL"/"INFO" expands to every field)
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Field new name (falls back to the original field name);
                        # renaming is passed to bcftools via "new:=INFO/old" below
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header (default num/type/desc when absent)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools -c column: rename syntax when the name changes
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command - keep only "##" meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file (BED has no INFO column set)
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/- 1Mb window around each variant,
                            # clamped at 0, then merged into non-overlapping intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files - per-chromosome annotated output + stderr
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command - annotate restricted to the BED regions, then tabix-index
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() can yield 0 when there are more commands than threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file (delete=True: removed automatically on close)
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command, appended to the merge command
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge - combine all per-chromosome annotated VCFs
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages - collect stderr from all commands
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                # htslib convention: "[W::" warnings, "[E::" errors
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed - any "[E::" line aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
3941 Used if no full "analysis" is provided. 3942 Default: "exome" 3943 - "phenopacket" (dict/file): 3944 Samples and phenotipic features parameters (see Exomiser docs). 3945 Either a dict, or a file in JSON or YAML format. 3946 Default: None 3947 - "subject" (dict): 3948 Sample parameters (see Exomiser docs). 3949 Example: 3950 "subject": 3951 { 3952 "id": "ISDBM322017", 3953 "sex": "FEMALE" 3954 } 3955 Default: None 3956 - "sample" (string): 3957 Sample name to construct "subject" section: 3958 "subject": 3959 { 3960 "id": "<sample>", 3961 "sex": "UNKNOWN_SEX" 3962 } 3963 Default: None 3964 - "phenotypicFeatures" (dict) 3965 Phenotypic features to construct "subject" section. 3966 Example: 3967 "phenotypicFeatures": 3968 [ 3969 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 3970 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 3971 ] 3972 - "hpo" (list) 3973 List of HPO ids as phenotypic features. 3974 Example: 3975 "hpo": ['0001156', '0001363', '0011304', '0010055'] 3976 Default: [] 3977 - "outputOptions" (dict): 3978 Output options (see Exomiser docs). 3979 Default: 3980 "output_options" = 3981 { 3982 "outputContributingVariantsOnly": False, 3983 "numGenes": 0, 3984 "outputFormats": ["TSV_VARIANT", "VCF"] 3985 } 3986 - "transcript_source" (string): 3987 Transcript source (either "refseq", "ucsc", "ensembl") 3988 Default: "refseq" 3989 - "exomiser_to_info" (boolean): 3990 Add exomiser TSV file columns as INFO fields in VCF. 3991 Default: False 3992 - "release" (string): 3993 Exomise database release. 3994 If not exists, database release will be downloaded (take a while). 3995 Default: None (provided by application.properties configuration file) 3996 - "exomiser_application_properties" (file): 3997 Exomiser configuration file (see Exomiser docs). 3998 Useful to automatically download databases (especially for specific genome databases). 
3999 4000 Notes: 4001 - If no sample in parameters, first sample in VCF will be chosen 4002 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4003 4004 :param threads: The number of threads to use 4005 :return: None. 4006 """ 4007 4008 # DEBUG 4009 log.debug("Start annotation with Exomiser databases") 4010 4011 # Threads 4012 if not threads: 4013 threads = self.get_threads() 4014 log.debug("Threads: " + str(threads)) 4015 4016 # Config 4017 config = self.get_config() 4018 log.debug("Config: " + str(config)) 4019 4020 # Config - Folders - Databases 4021 databases_folders = ( 4022 config.get("folders", {}) 4023 .get("databases", {}) 4024 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4025 ) 4026 databases_folders = full_path(databases_folders) 4027 if not os.path.exists(databases_folders): 4028 log.error(f"Databases annotations: {databases_folders} NOT found") 4029 log.debug("Databases annotations: " + str(databases_folders)) 4030 4031 # Config - Exomiser 4032 exomiser_bin_command = get_bin_command( 4033 bin="exomiser-cli*.jar", 4034 tool="exomiser", 4035 bin_type="jar", 4036 config=config, 4037 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4038 ) 4039 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4040 if not exomiser_bin_command: 4041 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4042 log.error(msg_err) 4043 raise ValueError(msg_err) 4044 4045 # Param 4046 param = self.get_param() 4047 log.debug("Param: " + str(param)) 4048 4049 # Param - Exomiser 4050 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4051 log.debug(f"Param Exomiser: {param_exomiser}") 4052 4053 # Param - Assembly 4054 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4055 log.debug("Assembly: " + str(assembly)) 4056 4057 # Data 4058 table_variants = self.get_table_variants() 4059 4060 # Check if not empty 4061 log.debug("Check if not empty") 4062 sql_query_chromosomes = 
( 4063 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4064 ) 4065 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4066 log.info(f"VCF empty") 4067 return False 4068 4069 # VCF header 4070 vcf_reader = self.get_header() 4071 log.debug("Initial header: " + str(vcf_reader.infos)) 4072 4073 # Samples 4074 samples = self.get_header_sample_list() 4075 if not samples: 4076 log.error("No Samples in VCF") 4077 return False 4078 log.debug(f"Samples: {samples}") 4079 4080 # Memory limit 4081 memory_limit = self.get_memory("8G") 4082 log.debug(f"memory_limit: {memory_limit}") 4083 4084 # Exomiser java options 4085 exomiser_java_options = ( 4086 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4087 ) 4088 log.debug(f"Exomiser java options: {exomiser_java_options}") 4089 4090 # Download Exomiser (if not exists) 4091 exomiser_release = param_exomiser.get("release", None) 4092 exomiser_application_properties = param_exomiser.get( 4093 "exomiser_application_properties", None 4094 ) 4095 databases_download_exomiser( 4096 assemblies=[assembly], 4097 exomiser_folder=databases_folders, 4098 exomiser_release=exomiser_release, 4099 exomiser_phenotype_release=exomiser_release, 4100 exomiser_application_properties=exomiser_application_properties, 4101 ) 4102 4103 # Force annotation 4104 force_update_annotation = True 4105 4106 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4107 log.debug("Start annotation Exomiser") 4108 4109 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4110 4111 # tmp_dir = "/tmp/exomiser" 4112 4113 ### ANALYSIS ### 4114 ################ 4115 4116 # Create analysis.json through analysis dict 4117 # either analysis in param or by default 4118 # depending on preset exome/genome) 4119 4120 # Init analysis dict 4121 param_exomiser_analysis_dict = {} 4122 4123 # analysis from param 4124 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4125 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4126 4127 # If analysis in param -> load anlaysis json 4128 if param_exomiser_analysis: 4129 4130 # If param analysis is a file and exists 4131 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4132 param_exomiser_analysis 4133 ): 4134 # Load analysis file into analysis dict (either yaml or json) 4135 with open(param_exomiser_analysis) as json_file: 4136 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4137 4138 # If param analysis is a dict 4139 elif isinstance(param_exomiser_analysis, dict): 4140 # Load analysis dict into analysis dict (either yaml or json) 4141 param_exomiser_analysis_dict = param_exomiser_analysis 4142 4143 # Error analysis type 4144 else: 4145 log.error(f"Analysis type unknown. Check param file.") 4146 raise ValueError(f"Analysis type unknown. Check param file.") 4147 4148 # Case no input analysis config file/dict 4149 # Use preset (exome/genome) to open default config file 4150 if not param_exomiser_analysis_dict: 4151 4152 # default preset 4153 default_preset = "exome" 4154 4155 # Get param preset or default preset 4156 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4157 4158 # Try to find if preset is a file 4159 if os.path.exists(param_exomiser_preset): 4160 # Preset file is provided in full path 4161 param_exomiser_analysis_default_config_file = ( 4162 param_exomiser_preset 4163 ) 4164 # elif os.path.exists(full_path(param_exomiser_preset)): 4165 # # Preset file is provided in full path 4166 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4167 elif os.path.exists( 4168 os.path.join(folder_config, param_exomiser_preset) 4169 ): 4170 # Preset file is provided a basename in config folder (can be a path with subfolders) 4171 param_exomiser_analysis_default_config_file = os.path.join( 4172 folder_config, param_exomiser_preset 4173 ) 4174 else: 4175 # Construct preset file 4176 
param_exomiser_analysis_default_config_file = os.path.join( 4177 folder_config, 4178 f"preset-{param_exomiser_preset}-analysis.json", 4179 ) 4180 4181 # If preset file exists 4182 param_exomiser_analysis_default_config_file = full_path( 4183 param_exomiser_analysis_default_config_file 4184 ) 4185 if os.path.exists(param_exomiser_analysis_default_config_file): 4186 # Load prest file into analysis dict (either yaml or json) 4187 with open( 4188 param_exomiser_analysis_default_config_file 4189 ) as json_file: 4190 # param_exomiser_analysis_dict[""] = json.load(json_file) 4191 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4192 json_file 4193 ) 4194 4195 # Error preset file 4196 else: 4197 log.error( 4198 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4199 ) 4200 raise ValueError( 4201 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4202 ) 4203 4204 # If no analysis dict created 4205 if not param_exomiser_analysis_dict: 4206 log.error(f"No analysis config") 4207 raise ValueError(f"No analysis config") 4208 4209 # Log 4210 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4211 4212 ### PHENOPACKET ### 4213 ################### 4214 4215 # If no PhenoPacket in analysis dict -> check in param 4216 if "phenopacket" not in param_exomiser_analysis_dict: 4217 4218 # If PhenoPacket in param -> load anlaysis json 4219 if param_exomiser.get("phenopacket", None): 4220 4221 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4222 param_exomiser_phenopacket = full_path( 4223 param_exomiser_phenopacket 4224 ) 4225 4226 # If param phenopacket is a file and exists 4227 if isinstance( 4228 param_exomiser_phenopacket, str 4229 ) and os.path.exists(param_exomiser_phenopacket): 4230 # Load phenopacket file into analysis dict (either yaml or json) 4231 with open(param_exomiser_phenopacket) as json_file: 4232 param_exomiser_analysis_dict["phenopacket"] = ( 4233 yaml.safe_load(json_file) 
4234 ) 4235 4236 # If param phenopacket is a dict 4237 elif isinstance(param_exomiser_phenopacket, dict): 4238 # Load phenopacket dict into analysis dict (either yaml or json) 4239 param_exomiser_analysis_dict["phenopacket"] = ( 4240 param_exomiser_phenopacket 4241 ) 4242 4243 # Error phenopacket type 4244 else: 4245 log.error(f"Phenopacket type unknown. Check param file.") 4246 raise ValueError( 4247 f"Phenopacket type unknown. Check param file." 4248 ) 4249 4250 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4251 if "phenopacket" not in param_exomiser_analysis_dict: 4252 4253 # Init PhenoPacket 4254 param_exomiser_analysis_dict["phenopacket"] = { 4255 "id": "analysis", 4256 "proband": {}, 4257 } 4258 4259 ### Add subject ### 4260 4261 # If subject exists 4262 param_exomiser_subject = param_exomiser.get("subject", {}) 4263 4264 # If subject not exists -> found sample ID 4265 if not param_exomiser_subject: 4266 4267 # Found sample ID in param 4268 sample = param_exomiser.get("sample", None) 4269 4270 # Find sample ID (first sample) 4271 if not sample: 4272 sample_list = self.get_header_sample_list() 4273 if len(sample_list) > 0: 4274 sample = sample_list[0] 4275 else: 4276 log.error(f"No sample found") 4277 raise ValueError(f"No sample found") 4278 4279 # Create subject 4280 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4281 4282 # Add to dict 4283 param_exomiser_analysis_dict["phenopacket"][ 4284 "subject" 4285 ] = param_exomiser_subject 4286 4287 ### Add "phenotypicFeatures" ### 4288 4289 # If phenotypicFeatures exists 4290 param_exomiser_phenotypicfeatures = param_exomiser.get( 4291 "phenotypicFeatures", [] 4292 ) 4293 4294 # If phenotypicFeatures not exists -> Try to infer from hpo list 4295 if not param_exomiser_phenotypicfeatures: 4296 4297 # Found HPO in param 4298 param_exomiser_hpo = param_exomiser.get("hpo", []) 4299 4300 # Split HPO if list in string format separated by comma 4301 if 
isinstance(param_exomiser_hpo, str): 4302 param_exomiser_hpo = param_exomiser_hpo.split(",") 4303 4304 # Create HPO list 4305 for hpo in param_exomiser_hpo: 4306 hpo_clean = re.sub("[^0-9]", "", hpo) 4307 param_exomiser_phenotypicfeatures.append( 4308 { 4309 "type": { 4310 "id": f"HP:{hpo_clean}", 4311 "label": f"HP:{hpo_clean}", 4312 } 4313 } 4314 ) 4315 4316 # Add to dict 4317 param_exomiser_analysis_dict["phenopacket"][ 4318 "phenotypicFeatures" 4319 ] = param_exomiser_phenotypicfeatures 4320 4321 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4322 if not param_exomiser_phenotypicfeatures: 4323 for step in param_exomiser_analysis_dict.get( 4324 "analysis", {} 4325 ).get("steps", []): 4326 if "hiPhivePrioritiser" in step: 4327 param_exomiser_analysis_dict.get("analysis", {}).get( 4328 "steps", [] 4329 ).remove(step) 4330 4331 ### Add Input File ### 4332 4333 # Initial file name and htsFiles 4334 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4335 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4336 { 4337 "uri": tmp_vcf_name, 4338 "htsFormat": "VCF", 4339 "genomeAssembly": assembly, 4340 } 4341 ] 4342 4343 ### Add metaData ### 4344 4345 # If metaData not in analysis dict 4346 if "metaData" not in param_exomiser_analysis_dict: 4347 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4348 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4349 "createdBy": "howard", 4350 "phenopacketSchemaVersion": 1, 4351 } 4352 4353 ### OutputOptions ### 4354 4355 # Init output result folder 4356 output_results = os.path.join(tmp_dir, "results") 4357 4358 # If no outputOptions in analysis dict 4359 if "outputOptions" not in param_exomiser_analysis_dict: 4360 4361 # default output formats 4362 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4363 4364 # Get outputOptions in param 4365 output_options = param_exomiser.get("outputOptions", None) 4366 4367 # If no output_options in param -> check 4368 if not output_options: 
4369 output_options = { 4370 "outputContributingVariantsOnly": False, 4371 "numGenes": 0, 4372 "outputFormats": defaut_output_formats, 4373 } 4374 4375 # Replace outputDirectory in output options 4376 output_options["outputDirectory"] = output_results 4377 output_options["outputFileName"] = "howard" 4378 4379 # Add outputOptions in analysis dict 4380 param_exomiser_analysis_dict["outputOptions"] = output_options 4381 4382 else: 4383 4384 # Replace output_results and output format (if exists in param) 4385 param_exomiser_analysis_dict["outputOptions"][ 4386 "outputDirectory" 4387 ] = output_results 4388 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4389 list( 4390 set( 4391 param_exomiser_analysis_dict.get( 4392 "outputOptions", {} 4393 ).get("outputFormats", []) 4394 + ["TSV_VARIANT", "VCF"] 4395 ) 4396 ) 4397 ) 4398 4399 # log 4400 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4401 4402 ### ANALYSIS FILE ### 4403 ##################### 4404 4405 ### Full JSON analysis config file ### 4406 4407 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4408 with open(exomiser_analysis, "w") as fp: 4409 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4410 4411 ### SPLIT analysis and sample config files 4412 4413 # Splitted analysis dict 4414 param_exomiser_analysis_dict_for_split = ( 4415 param_exomiser_analysis_dict.copy() 4416 ) 4417 4418 # Phenopacket JSON file 4419 exomiser_analysis_phenopacket = os.path.join( 4420 tmp_dir, "analysis_phenopacket.json" 4421 ) 4422 with open(exomiser_analysis_phenopacket, "w") as fp: 4423 json.dump( 4424 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4425 fp, 4426 indent=4, 4427 ) 4428 4429 # Analysis JSON file without Phenopacket parameters 4430 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4431 exomiser_analysis_analysis = os.path.join( 4432 tmp_dir, "analysis_analysis.json" 4433 ) 4434 with open(exomiser_analysis_analysis, "w") as fp: 4435 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4436 4437 ### INITAL VCF file ### 4438 ####################### 4439 4440 ### Create list of samples to use and include inti initial VCF file #### 4441 4442 # Subject (main sample) 4443 # Get sample ID in analysis dict 4444 sample_subject = ( 4445 param_exomiser_analysis_dict.get("phenopacket", {}) 4446 .get("subject", {}) 4447 .get("id", None) 4448 ) 4449 sample_proband = ( 4450 param_exomiser_analysis_dict.get("phenopacket", {}) 4451 .get("proband", {}) 4452 .get("subject", {}) 4453 .get("id", None) 4454 ) 4455 sample = [] 4456 if sample_subject: 4457 sample.append(sample_subject) 4458 if sample_proband: 4459 sample.append(sample_proband) 4460 4461 # Get sample ID within Pedigree 4462 pedigree_persons_list = ( 4463 param_exomiser_analysis_dict.get("phenopacket", {}) 4464 .get("pedigree", {}) 4465 .get("persons", {}) 4466 ) 4467 4468 # Create list with all sample ID in pedigree (if exists) 4469 pedigree_persons = [] 4470 for person in pedigree_persons_list: 4471 pedigree_persons.append(person.get("individualId")) 4472 4473 # Concat subject sample ID and samples ID in pedigreesamples 4474 samples = list(set(sample + pedigree_persons)) 4475 4476 # Check if sample list is not empty 4477 if not samples: 4478 log.error(f"No samples found") 4479 raise ValueError(f"No samples found") 4480 4481 # Create VCF with sample (either sample in param or first one by default) 4482 # Export VCF file 4483 self.export_variant_vcf( 4484 vcf_file=tmp_vcf_name, 4485 remove_info=True, 4486 add_samples=True, 4487 list_samples=samples, 4488 index=False, 4489 ) 4490 4491 ### Execute Exomiser ### 4492 ######################## 4493 4494 # Init command 4495 exomiser_command = "" 4496 4497 # Command exomiser options 4498 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4499 4500 # Release 4501 exomiser_release = 
param_exomiser.get("release", None) 4502 if exomiser_release: 4503 # phenotype data version 4504 exomiser_options += ( 4505 f" --exomiser.phenotype.data-version={exomiser_release} " 4506 ) 4507 # data version 4508 exomiser_options += ( 4509 f" --exomiser.{assembly}.data-version={exomiser_release} " 4510 ) 4511 # variant white list 4512 variant_white_list_file = ( 4513 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4514 ) 4515 if os.path.exists( 4516 os.path.join( 4517 databases_folders, assembly, variant_white_list_file 4518 ) 4519 ): 4520 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4521 4522 # transcript_source 4523 transcript_source = param_exomiser.get( 4524 "transcript_source", None 4525 ) # ucsc, refseq, ensembl 4526 if transcript_source: 4527 exomiser_options += ( 4528 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4529 ) 4530 4531 # If analysis contain proband param 4532 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4533 "proband", {} 4534 ): 4535 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4536 4537 # If no proband (usually uniq sample) 4538 else: 4539 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4540 4541 # Log 4542 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4543 4544 # Run command 4545 result = subprocess.call( 4546 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4547 ) 4548 if result: 4549 log.error("Exomiser command failed") 4550 raise ValueError("Exomiser command failed") 4551 4552 ### RESULTS ### 4553 ############### 4554 4555 ### Annotate with TSV fields ### 4556 4557 # Init result tsv file 4558 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4559 4560 # Init result tsv file 4561 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 4562 4563 # Parse TSV file and explode columns in INFO field 4564 if exomiser_to_info and os.path.exists(output_results_tsv): 4565 4566 # Log 4567 log.debug("Exomiser columns to VCF INFO field") 4568 4569 # Retrieve columns and types 4570 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4571 output_results_tsv_df = self.get_query_to_df(query) 4572 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4573 4574 # Init concat fields for update 4575 sql_query_update_concat_fields = [] 4576 4577 # Fields to avoid 4578 fields_to_avoid = [ 4579 "CONTIG", 4580 "START", 4581 "END", 4582 "REF", 4583 "ALT", 4584 "QUAL", 4585 "FILTER", 4586 "GENOTYPE", 4587 ] 4588 4589 # List all columns to add into header 4590 for header_column in output_results_tsv_columns: 4591 4592 # If header column is enable 4593 if header_column not in fields_to_avoid: 4594 4595 # Header info type 4596 header_info_type = "String" 4597 header_column_df = output_results_tsv_df[header_column] 4598 header_column_df_dtype = header_column_df.dtype 4599 if header_column_df_dtype == object: 4600 if ( 4601 pd.to_numeric(header_column_df, errors="coerce") 4602 .notnull() 4603 .all() 4604 ): 4605 header_info_type = "Float" 4606 else: 4607 header_info_type = "Integer" 4608 4609 # Header info 4610 characters_to_validate = ["-"] 4611 pattern = "[" + "".join(characters_to_validate) + "]" 4612 header_info_name = re.sub( 4613 pattern, 4614 "_", 4615 f"Exomiser_{header_column}".replace("#", ""), 4616 ) 4617 header_info_number = "." 
4618 header_info_description = ( 4619 f"Exomiser {header_column} annotation" 4620 ) 4621 header_info_source = "Exomiser" 4622 header_info_version = "unknown" 4623 header_info_code = CODE_TYPE_MAP[header_info_type] 4624 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4625 header_info_name, 4626 header_info_number, 4627 header_info_type, 4628 header_info_description, 4629 header_info_source, 4630 header_info_version, 4631 header_info_code, 4632 ) 4633 4634 # Add field to add for update to concat fields 4635 sql_query_update_concat_fields.append( 4636 f""" 4637 CASE 4638 WHEN table_parquet."{header_column}" NOT IN ('','.') 4639 THEN concat( 4640 '{header_info_name}=', 4641 table_parquet."{header_column}", 4642 ';' 4643 ) 4644 4645 ELSE '' 4646 END 4647 """ 4648 ) 4649 4650 # Update query 4651 sql_query_update = f""" 4652 UPDATE {table_variants} as table_variants 4653 SET INFO = concat( 4654 CASE 4655 WHEN INFO NOT IN ('', '.') 4656 THEN INFO 4657 ELSE '' 4658 END, 4659 CASE 4660 WHEN table_variants.INFO NOT IN ('','.') 4661 THEN ';' 4662 ELSE '' 4663 END, 4664 ( 4665 SELECT 4666 concat( 4667 {",".join(sql_query_update_concat_fields)} 4668 ) 4669 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4670 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4671 AND table_parquet.\"START\" = table_variants.\"POS\" 4672 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4673 AND table_parquet.\"REF\" = table_variants.\"REF\" 4674 ) 4675 ) 4676 ; 4677 """ 4678 4679 # Update 4680 self.conn.execute(sql_query_update) 4681 4682 ### Annotate with VCF INFO field ### 4683 4684 # Init result VCF file 4685 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4686 4687 # If VCF exists 4688 if os.path.exists(output_results_vcf): 4689 4690 # Log 4691 log.debug("Exomiser result VCF update variants") 4692 4693 # Find Exomiser INFO field annotation in header 4694 with 
gzip.open(output_results_vcf, "rt") as f: 4695 header_list = self.read_vcf_header(f) 4696 exomiser_vcf_header = vcf.Reader( 4697 io.StringIO("\n".join(header_list)) 4698 ) 4699 4700 # Add annotation INFO field to header 4701 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 4702 4703 # Update variants with VCF 4704 self.update_from_vcf(output_results_vcf) 4705 4706 return True 4707 4708 def annotation_snpeff(self, threads: int = None) -> None: 4709 """ 4710 This function annotate with snpEff 4711 4712 :param threads: The number of threads to use 4713 :return: the value of the variable "return_value". 4714 """ 4715 4716 # DEBUG 4717 log.debug("Start annotation with snpeff databases") 4718 4719 # Threads 4720 if not threads: 4721 threads = self.get_threads() 4722 log.debug("Threads: " + str(threads)) 4723 4724 # DEBUG 4725 delete_tmp = True 4726 if self.get_config().get("verbosity", "warning") in ["debug"]: 4727 delete_tmp = False 4728 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4729 4730 # Config 4731 config = self.get_config() 4732 log.debug("Config: " + str(config)) 4733 4734 # Config - Folders - Databases 4735 databases_folders = ( 4736 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4737 ) 4738 log.debug("Databases annotations: " + str(databases_folders)) 4739 4740 # # Config - Java 4741 # java_bin = get_bin( 4742 # tool="java", 4743 # bin="java", 4744 # bin_type="bin", 4745 # config=config, 4746 # default_folder="/usr/bin", 4747 # ) 4748 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4749 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4750 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4751 4752 # # Config - snpEff bin 4753 # snpeff_jar = get_bin( 4754 # tool="snpeff", 4755 # bin="snpEff.jar", 4756 # bin_type="jar", 4757 # config=config, 4758 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4759 # ) 4760 # if not (os.path.exists(snpeff_jar) or 
(snpeff_jar and which(snpeff_jar))): 4761 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4762 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4763 4764 # Config - snpEff bin command 4765 snpeff_bin_command = get_bin_command( 4766 bin="snpEff.jar", 4767 tool="snpeff", 4768 bin_type="jar", 4769 config=config, 4770 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4771 ) 4772 if not snpeff_bin_command: 4773 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4774 log.error(msg_err) 4775 raise ValueError(msg_err) 4776 4777 # Config - snpEff databases 4778 snpeff_databases = ( 4779 config.get("folders", {}) 4780 .get("databases", {}) 4781 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4782 ) 4783 snpeff_databases = full_path(snpeff_databases) 4784 if snpeff_databases is not None and snpeff_databases != "": 4785 log.debug(f"Create snpEff databases folder") 4786 if not os.path.exists(snpeff_databases): 4787 os.makedirs(snpeff_databases) 4788 4789 # Param 4790 param = self.get_param() 4791 log.debug("Param: " + str(param)) 4792 4793 # Param 4794 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4795 log.debug("Options: " + str(options)) 4796 4797 # Param - Assembly 4798 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4799 4800 # Param - Options 4801 snpeff_options = ( 4802 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4803 ) 4804 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4805 snpeff_csvstats = ( 4806 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4807 ) 4808 if snpeff_stats: 4809 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4810 snpeff_stats = full_path(snpeff_stats) 4811 snpeff_options += f" -stats {snpeff_stats}" 4812 if snpeff_csvstats: 4813 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4814 snpeff_csvstats = full_path(snpeff_csvstats) 4815 
snpeff_options += f" -csvStats {snpeff_csvstats}" 4816 4817 # Data 4818 table_variants = self.get_table_variants() 4819 4820 # Check if not empty 4821 log.debug("Check if not empty") 4822 sql_query_chromosomes = ( 4823 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4824 ) 4825 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4826 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4827 log.info(f"VCF empty") 4828 return 4829 4830 # Export in VCF 4831 log.debug("Create initial file to annotate") 4832 tmp_vcf = NamedTemporaryFile( 4833 prefix=self.get_prefix(), 4834 dir=self.get_tmp_dir(), 4835 suffix=".vcf.gz", 4836 delete=True, 4837 ) 4838 tmp_vcf_name = tmp_vcf.name 4839 4840 # VCF header 4841 vcf_reader = self.get_header() 4842 log.debug("Initial header: " + str(vcf_reader.infos)) 4843 4844 # Existing annotations 4845 for vcf_annotation in self.get_header().infos: 4846 4847 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4848 log.debug( 4849 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4850 ) 4851 4852 # Memory limit 4853 # if config.get("memory", None): 4854 # memory_limit = config.get("memory", "8G") 4855 # else: 4856 # memory_limit = "8G" 4857 memory_limit = self.get_memory("8G") 4858 log.debug(f"memory_limit: {memory_limit}") 4859 4860 # snpEff java options 4861 snpeff_java_options = ( 4862 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4863 ) 4864 log.debug(f"Exomiser java options: {snpeff_java_options}") 4865 4866 force_update_annotation = True 4867 4868 if "ANN" not in self.get_header().infos or force_update_annotation: 4869 4870 # Check snpEff database 4871 log.debug(f"Check snpEff databases {[assembly]}") 4872 databases_download_snpeff( 4873 folder=snpeff_databases, assemblies=[assembly], config=config 4874 ) 4875 4876 # Export VCF file 4877 self.export_variant_vcf( 4878 vcf_file=tmp_vcf_name, 4879 remove_info=True, 
4880 add_samples=False, 4881 index=True, 4882 ) 4883 4884 # Tmp file 4885 err_files = [] 4886 tmp_annotate_vcf = NamedTemporaryFile( 4887 prefix=self.get_prefix(), 4888 dir=self.get_tmp_dir(), 4889 suffix=".vcf", 4890 delete=False, 4891 ) 4892 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4893 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4894 err_files.append(tmp_annotate_vcf_name_err) 4895 4896 # Command 4897 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 4898 log.debug(f"Annotation - snpEff command: {snpeff_command}") 4899 run_parallel_commands([snpeff_command], 1) 4900 4901 # Error messages 4902 log.info(f"Error/Warning messages:") 4903 error_message_command_all = [] 4904 error_message_command_warning = [] 4905 error_message_command_err = [] 4906 for err_file in err_files: 4907 with open(err_file, "r") as f: 4908 for line in f: 4909 message = line.strip() 4910 error_message_command_all.append(message) 4911 if line.startswith("[W::"): 4912 error_message_command_warning.append(message) 4913 if line.startswith("[E::"): 4914 error_message_command_err.append(f"{err_file}: " + message) 4915 # log info 4916 for message in list( 4917 set(error_message_command_err + error_message_command_warning) 4918 ): 4919 log.info(f" {message}") 4920 # debug info 4921 for message in list(set(error_message_command_all)): 4922 log.debug(f" {message}") 4923 # failed 4924 if len(error_message_command_err): 4925 log.error("Annotation failed: Error in commands") 4926 raise ValueError("Annotation failed: Error in commands") 4927 4928 # Find annotation in header 4929 with open(tmp_annotate_vcf_name, "rt") as f: 4930 header_list = self.read_vcf_header(f) 4931 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 4932 4933 for ann in annovar_vcf_header.infos: 4934 if ann not in self.get_header().infos: 4935 vcf_reader.infos[ann] = 
annovar_vcf_header.infos.get(ann) 4936 4937 # Update variants 4938 log.info(f"Annotation - Updating...") 4939 self.update_from_vcf(tmp_annotate_vcf_name) 4940 4941 else: 4942 if "ANN" in self.get_header().infos: 4943 log.debug(f"Existing snpEff annotations in VCF") 4944 if force_update_annotation: 4945 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 4946 4947 def annotation_annovar(self, threads: int = None) -> None: 4948 """ 4949 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 4950 annotations 4951 4952 :param threads: number of threads to use 4953 :return: the value of the variable "return_value". 4954 """ 4955 4956 # DEBUG 4957 log.debug("Start annotation with Annovar databases") 4958 4959 # Threads 4960 if not threads: 4961 threads = self.get_threads() 4962 log.debug("Threads: " + str(threads)) 4963 4964 # Tmp en Err files 4965 tmp_files = [] 4966 err_files = [] 4967 4968 # DEBUG 4969 delete_tmp = True 4970 if self.get_config().get("verbosity", "warning") in ["debug"]: 4971 delete_tmp = False 4972 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4973 4974 # Config 4975 config = self.get_config() 4976 log.debug("Config: " + str(config)) 4977 4978 # Config - Folders - Databases 4979 databases_folders = ( 4980 config.get("folders", {}).get("databases", {}).get("annovar", ["."]) 4981 ) 4982 log.debug("Databases annotations: " + str(databases_folders)) 4983 4984 # Config - annovar bin command 4985 annovar_bin_command = get_bin_command( 4986 bin="table_annovar.pl", 4987 tool="annovar", 4988 bin_type="perl", 4989 config=config, 4990 default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar", 4991 ) 4992 if not annovar_bin_command: 4993 msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'" 4994 log.error(msg_err) 4995 raise ValueError(msg_err) 4996 4997 # Config - BCFTools bin command 4998 bcftools_bin_command = get_bin_command( 4999 bin="bcftools", 5000 tool="bcftools", 5001 
bin_type="bin", 5002 config=config, 5003 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 5004 ) 5005 if not bcftools_bin_command: 5006 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 5007 log.error(msg_err) 5008 raise ValueError(msg_err) 5009 5010 # Config - annovar databases 5011 annovar_databases = ( 5012 config.get("folders", {}) 5013 .get("databases", {}) 5014 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 5015 ) 5016 annovar_databases = full_path(annovar_databases) 5017 if annovar_databases != "" and not os.path.exists(annovar_databases): 5018 os.makedirs(annovar_databases) 5019 5020 # Param 5021 param = self.get_param() 5022 log.debug("Param: " + str(param)) 5023 5024 # Param - options 5025 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 5026 log.debug("Options: " + str(options)) 5027 5028 # Param - annotations 5029 annotations = ( 5030 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 5031 ) 5032 log.debug("Annotations: " + str(annotations)) 5033 5034 # Param - Assembly 5035 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5036 5037 # Annovar database assembly 5038 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 5039 if annovar_databases_assembly != "" and not os.path.exists( 5040 annovar_databases_assembly 5041 ): 5042 os.makedirs(annovar_databases_assembly) 5043 5044 # Data 5045 table_variants = self.get_table_variants() 5046 5047 # Check if not empty 5048 log.debug("Check if not empty") 5049 sql_query_chromosomes = ( 5050 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5051 ) 5052 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 5053 if not sql_query_chromosomes_df["count"][0]: 5054 log.info(f"VCF empty") 5055 return 5056 5057 # VCF header 5058 vcf_reader = self.get_header() 5059 log.debug("Initial header: " + str(vcf_reader.infos)) 5060 5061 # Existing annotations 5062 for vcf_annotation in 
self.get_header().infos: 5063 5064 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5065 log.debug( 5066 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5067 ) 5068 5069 force_update_annotation = True 5070 5071 if annotations: 5072 5073 commands = [] 5074 tmp_annotates_vcf_name_list = [] 5075 5076 # Export in VCF 5077 log.debug("Create initial file to annotate") 5078 tmp_vcf = NamedTemporaryFile( 5079 prefix=self.get_prefix(), 5080 dir=self.get_tmp_dir(), 5081 suffix=".vcf.gz", 5082 delete=False, 5083 ) 5084 tmp_vcf_name = tmp_vcf.name 5085 tmp_files.append(tmp_vcf_name) 5086 tmp_files.append(tmp_vcf_name + ".tbi") 5087 5088 # Export VCF file 5089 self.export_variant_vcf( 5090 vcf_file=tmp_vcf_name, 5091 remove_info=".", 5092 add_samples=False, 5093 index=True, 5094 ) 5095 5096 # Create file for field rename 5097 log.debug("Create file for field rename") 5098 tmp_rename = NamedTemporaryFile( 5099 prefix=self.get_prefix(), 5100 dir=self.get_tmp_dir(), 5101 suffix=".rename", 5102 delete=False, 5103 ) 5104 tmp_rename_name = tmp_rename.name 5105 tmp_files.append(tmp_rename_name) 5106 5107 # Check Annovar database 5108 log.debug( 5109 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5110 ) 5111 databases_download_annovar( 5112 folder=annovar_databases, 5113 files=list(annotations.keys()), 5114 assemblies=[assembly], 5115 ) 5116 5117 for annotation in annotations: 5118 annotation_fields = annotations[annotation] 5119 5120 if not annotation_fields: 5121 annotation_fields = {"INFO": None} 5122 5123 log.info(f"Annotations Annovar - database '{annotation}'") 5124 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5125 5126 # Tmp file for annovar 5127 err_files = [] 5128 tmp_annotate_vcf_directory = TemporaryDirectory( 5129 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5130 ) 5131 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5132 
                # --- Per-database Annovar output paths -------------------------------
                # table_annovar.pl writes "<prefix>.<assembly>_multianno.vcf"; this is
                # the file picked up after the run.
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                # stderr of the whole command pipeline is collected here and parsed
                # later for "[W::"/"[E::" style warnings and errors.
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Final bgzipped VCF annotated by Annovar for this database; kept in
                # tmp_annotates_vcf_name_list so all databases can be merged afterwards.
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields selected for this database (original and renamed forms)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # Target field name when a rename is configured; falls back to the
                    # original field name when the configured value is empty.
                    # NOTE(review): field renaming is only partially managed here — TODO.
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    # Keep the field when updates are forced, or when the (renamed)
                    # field is not already present in the VCF header.
                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # field already in header and update not forced: skip it
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Append an "old new" mapping line consumed later by
                    # bcftools annotate --rename-annots
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                log.debug("annotation_list: " + str(annotation_list))

                # protocol: Annovar protocol/database name (--protocol)
                protocol = annotation

                # Extra argument forwarded via --argument (gene-based databases only)
                argument = ""

                # Annovar operation code: f=filter-based, g=gene-based, r=region-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # --argument option (only when an argument was built above)
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # Common table_annovar.pl options; every user option except
                # "genebase" (already consumed above) is forwarded as --<opt>=<value>
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - run Annovar, then move its "_multianno" output to a
                # predictable tmp name for the downstream bcftools pipe
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe (stream the Annovar output through bcftools)
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation): turn escaped
                # "\x3b" sequences back into "," so INFO parsing is not broken
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value "."): awk rebuilds the
                # INFO column (field $8), keeping only key=value pairs whose value
                # is not "."
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - keep only the requested INFO fields (unless ALL/INFO was
                # asked), always drop Annovar bookkeeping tags, apply the rename
                # file, and bgzip-compress the result
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Collect the pipeline's stderr and classify each message as
                # warning ("[W::"/"WARNING") or error ("[E::"/"ERROR")
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
error_message_command_err.append( 5261 f"{err_file}: " + message 5262 ) 5263 # log info 5264 for message in list( 5265 set(error_message_command_err + error_message_command_warning) 5266 ): 5267 log.info(f" {message}") 5268 # debug info 5269 for message in list(set(error_message_command_all)): 5270 log.debug(f" {message}") 5271 # failed 5272 if len(error_message_command_err): 5273 log.error("Annotation failed: Error in commands") 5274 raise ValueError("Annotation failed: Error in commands") 5275 5276 if tmp_annotates_vcf_name_list: 5277 5278 # List of annotated files 5279 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5280 5281 # Tmp file 5282 tmp_annotate_vcf = NamedTemporaryFile( 5283 prefix=self.get_prefix(), 5284 dir=self.get_tmp_dir(), 5285 suffix=".vcf.gz", 5286 delete=False, 5287 ) 5288 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5289 tmp_files.append(tmp_annotate_vcf_name) 5290 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5291 err_files.append(tmp_annotate_vcf_name_err) 5292 tmp_files.append(tmp_annotate_vcf_name_err) 5293 5294 # Command merge 5295 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5296 log.info( 5297 f"Annotation Annovar - Annotation merging " 5298 + str(len(tmp_annotates_vcf_name_list)) 5299 + " annotated files" 5300 ) 5301 log.debug(f"Annotation - merge command: {merge_command}") 5302 run_parallel_commands([merge_command], 1) 5303 5304 # Find annotation in header 5305 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5306 header_list = self.read_vcf_header(f) 5307 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5308 5309 for ann in annovar_vcf_header.infos: 5310 if ann not in self.get_header().infos: 5311 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5312 5313 # Update variants 5314 log.info(f"Annotation Annovar - 
Updating...") 5315 self.update_from_vcf(tmp_annotate_vcf_name) 5316 5317 # Clean files 5318 # Tmp file remove command 5319 if True: 5320 tmp_files_remove_command = "" 5321 if tmp_files: 5322 tmp_files_remove_command = " ".join(tmp_files) 5323 clean_command = f" rm -f {tmp_files_remove_command} " 5324 log.debug(f"Annotation Annovar - Annotation cleaning ") 5325 log.debug(f"Annotation - cleaning command: {clean_command}") 5326 run_parallel_commands([clean_command], 1) 5327 5328 # Parquet 5329 def annotation_parquet(self, threads: int = None) -> None: 5330 """ 5331 It takes a VCF file, and annotates it with a parquet file 5332 5333 :param threads: number of threads to use for the annotation 5334 :return: the value of the variable "result". 5335 """ 5336 5337 # DEBUG 5338 log.debug("Start annotation with parquet databases") 5339 5340 # Threads 5341 if not threads: 5342 threads = self.get_threads() 5343 log.debug("Threads: " + str(threads)) 5344 5345 # DEBUG 5346 delete_tmp = True 5347 if self.get_config().get("verbosity", "warning") in ["debug"]: 5348 delete_tmp = False 5349 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5350 5351 # Config 5352 databases_folders = set( 5353 self.get_config() 5354 .get("folders", {}) 5355 .get("databases", {}) 5356 .get("annotations", ["."]) 5357 + self.get_config() 5358 .get("folders", {}) 5359 .get("databases", {}) 5360 .get("parquet", ["."]) 5361 ) 5362 log.debug("Databases annotations: " + str(databases_folders)) 5363 5364 # Param 5365 annotations = ( 5366 self.get_param() 5367 .get("annotation", {}) 5368 .get("parquet", {}) 5369 .get("annotations", None) 5370 ) 5371 log.debug("Annotations: " + str(annotations)) 5372 5373 # Assembly 5374 assembly = self.get_param().get( 5375 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5376 ) 5377 5378 # Force Update Annotation 5379 force_update_annotation = ( 5380 self.get_param() 5381 .get("annotation", {}) 5382 .get("options", {}) 5383 .get("annotations_update", 
False) 5384 ) 5385 log.debug(f"force_update_annotation={force_update_annotation}") 5386 force_append_annotation = ( 5387 self.get_param() 5388 .get("annotation", {}) 5389 .get("options", {}) 5390 .get("annotations_append", False) 5391 ) 5392 log.debug(f"force_append_annotation={force_append_annotation}") 5393 5394 # Data 5395 table_variants = self.get_table_variants() 5396 5397 # Check if not empty 5398 log.debug("Check if not empty") 5399 sql_query_chromosomes_df = self.get_query_to_df( 5400 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5401 ) 5402 if not sql_query_chromosomes_df["count"][0]: 5403 log.info(f"VCF empty") 5404 return 5405 5406 # VCF header 5407 vcf_reader = self.get_header() 5408 log.debug("Initial header: " + str(vcf_reader.infos)) 5409 5410 # Nb Variants POS 5411 log.debug("NB Variants Start") 5412 nb_variants = self.conn.execute( 5413 f"SELECT count(*) AS count FROM variants" 5414 ).fetchdf()["count"][0] 5415 log.debug("NB Variants Stop") 5416 5417 # Existing annotations 5418 for vcf_annotation in self.get_header().infos: 5419 5420 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5421 log.debug( 5422 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5423 ) 5424 5425 # Added columns 5426 added_columns = [] 5427 5428 # drop indexes 5429 log.debug(f"Drop indexes...") 5430 self.drop_indexes() 5431 5432 if annotations: 5433 5434 if "ALL" in annotations: 5435 5436 all_param = annotations.get("ALL", {}) 5437 all_param_formats = all_param.get("formats", None) 5438 all_param_releases = all_param.get("releases", None) 5439 5440 databases_infos_dict = self.scan_databases( 5441 database_formats=all_param_formats, 5442 database_releases=all_param_releases, 5443 ) 5444 for database_infos in databases_infos_dict.keys(): 5445 if database_infos not in annotations: 5446 annotations[database_infos] = {"INFO": None} 5447 5448 for annotation in annotations: 5449 5450 if annotation in ["ALL"]: 
5451 continue 5452 5453 # Annotation Name 5454 annotation_name = os.path.basename(annotation) 5455 5456 # Annotation fields 5457 annotation_fields = annotations[annotation] 5458 if not annotation_fields: 5459 annotation_fields = {"INFO": None} 5460 5461 log.debug(f"Annotation '{annotation_name}'") 5462 log.debug( 5463 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5464 ) 5465 5466 # Create Database 5467 database = Database( 5468 database=annotation, 5469 databases_folders=databases_folders, 5470 assembly=assembly, 5471 ) 5472 5473 # Find files 5474 parquet_file = database.get_database() 5475 parquet_hdr_file = database.get_header_file() 5476 parquet_type = database.get_type() 5477 5478 # Check if files exists 5479 if not parquet_file or not parquet_hdr_file: 5480 log.error("Annotation failed: file not found") 5481 raise ValueError("Annotation failed: file not found") 5482 else: 5483 # Get parquet connexion 5484 parquet_sql_attach = database.get_sql_database_attach( 5485 output="query" 5486 ) 5487 if parquet_sql_attach: 5488 self.conn.execute(parquet_sql_attach) 5489 parquet_file_link = database.get_sql_database_link() 5490 # Log 5491 log.debug( 5492 f"Annotation '{annotation_name}' - file: " 5493 + str(parquet_file) 5494 + " and " 5495 + str(parquet_hdr_file) 5496 ) 5497 5498 # Database full header columns 5499 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 5500 parquet_hdr_file 5501 ) 5502 # Log 5503 log.debug( 5504 "Annotation database header columns : " 5505 + str(parquet_hdr_vcf_header_columns) 5506 ) 5507 5508 # Load header as VCF object 5509 parquet_hdr_vcf_header_infos = database.get_header().infos 5510 # Log 5511 log.debug( 5512 "Annotation database header: " 5513 + str(parquet_hdr_vcf_header_infos) 5514 ) 5515 5516 # Get extra infos 5517 parquet_columns = database.get_extra_columns() 5518 # Log 5519 log.debug("Annotation database Columns: " + str(parquet_columns)) 5520 5521 # Add extra columns if "ALL" in 
annotation_fields 5522 # if "ALL" in annotation_fields: 5523 # allow_add_extra_column = True 5524 if "ALL" in annotation_fields and database.get_extra_columns(): 5525 for extra_column in database.get_extra_columns(): 5526 if ( 5527 extra_column not in annotation_fields 5528 and extra_column.replace("INFO/", "") 5529 not in parquet_hdr_vcf_header_infos 5530 ): 5531 parquet_hdr_vcf_header_infos[extra_column] = ( 5532 vcf.parser._Info( 5533 extra_column, 5534 ".", 5535 "String", 5536 f"{extra_column} description", 5537 "unknown", 5538 "unknown", 5539 self.code_type_map["String"], 5540 ) 5541 ) 5542 5543 # For all fields in database 5544 annotation_fields_all = False 5545 if "ALL" in annotation_fields or "INFO" in annotation_fields: 5546 annotation_fields_all = True 5547 annotation_fields = { 5548 key: key for key in parquet_hdr_vcf_header_infos 5549 } 5550 5551 log.debug( 5552 "Annotation database header - All annotations added: " 5553 + str(annotation_fields) 5554 ) 5555 5556 # Init 5557 5558 # List of annotation fields to use 5559 sql_query_annotation_update_info_sets = [] 5560 5561 # List of annotation to agregate 5562 sql_query_annotation_to_agregate = [] 5563 5564 # Number of fields 5565 nb_annotation_field = 0 5566 5567 # Annotation fields processed 5568 annotation_fields_processed = [] 5569 5570 # Columns mapping 5571 map_columns = database.map_columns( 5572 columns=annotation_fields, prefixes=["INFO/"] 5573 ) 5574 5575 # Query dict for fields to remove (update option) 5576 query_dict_remove = {} 5577 5578 # Fetch Anotation fields 5579 for annotation_field in annotation_fields: 5580 5581 # annotation_field_column 5582 annotation_field_column = map_columns.get( 5583 annotation_field, "INFO" 5584 ) 5585 5586 # field new name, if parametered 5587 annotation_fields_new_name = annotation_fields.get( 5588 annotation_field, annotation_field 5589 ) 5590 if not annotation_fields_new_name: 5591 annotation_fields_new_name = annotation_field 5592 5593 # To annotate 5594 # 
force_update_annotation = True 5595 # force_append_annotation = True 5596 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 5597 if annotation_field in parquet_hdr_vcf_header_infos and ( 5598 force_update_annotation 5599 or force_append_annotation 5600 or ( 5601 annotation_fields_new_name 5602 not in self.get_header().infos 5603 ) 5604 ): 5605 5606 # Add field to annotation to process list 5607 annotation_fields_processed.append( 5608 annotation_fields_new_name 5609 ) 5610 5611 # explode infos for the field 5612 annotation_fields_new_name_info_msg = "" 5613 if ( 5614 force_update_annotation 5615 and annotation_fields_new_name 5616 in self.get_header().infos 5617 ): 5618 # Remove field from INFO 5619 query = f""" 5620 UPDATE {table_variants} as table_variants 5621 SET INFO = REGEXP_REPLACE( 5622 concat(table_variants.INFO,''), 5623 ';*{annotation_fields_new_name}=[^;]*', 5624 '' 5625 ) 5626 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 5627 """ 5628 annotation_fields_new_name_info_msg = " [update]" 5629 query_dict_remove[ 5630 f"remove 'INFO/{annotation_fields_new_name}'" 5631 ] = query 5632 5633 # Sep between fields in INFO 5634 nb_annotation_field += 1 5635 if nb_annotation_field > 1: 5636 annotation_field_sep = ";" 5637 else: 5638 annotation_field_sep = "" 5639 5640 log.info( 5641 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 5642 ) 5643 5644 # Add INFO field to header 5645 parquet_hdr_vcf_header_infos_number = ( 5646 parquet_hdr_vcf_header_infos[annotation_field].num 5647 or "." 
5648 ) 5649 parquet_hdr_vcf_header_infos_type = ( 5650 parquet_hdr_vcf_header_infos[annotation_field].type 5651 or "String" 5652 ) 5653 parquet_hdr_vcf_header_infos_description = ( 5654 parquet_hdr_vcf_header_infos[annotation_field].desc 5655 or f"{annotation_field} description" 5656 ) 5657 parquet_hdr_vcf_header_infos_source = ( 5658 parquet_hdr_vcf_header_infos[annotation_field].source 5659 or "unknown" 5660 ) 5661 parquet_hdr_vcf_header_infos_version = ( 5662 parquet_hdr_vcf_header_infos[annotation_field].version 5663 or "unknown" 5664 ) 5665 5666 vcf_reader.infos[annotation_fields_new_name] = ( 5667 vcf.parser._Info( 5668 annotation_fields_new_name, 5669 parquet_hdr_vcf_header_infos_number, 5670 parquet_hdr_vcf_header_infos_type, 5671 parquet_hdr_vcf_header_infos_description, 5672 parquet_hdr_vcf_header_infos_source, 5673 parquet_hdr_vcf_header_infos_version, 5674 self.code_type_map[ 5675 parquet_hdr_vcf_header_infos_type 5676 ], 5677 ) 5678 ) 5679 5680 # Append 5681 if force_append_annotation: 5682 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 5683 else: 5684 query_case_when_append = "" 5685 5686 # Annotation/Update query fields 5687 # Found in INFO column 5688 if ( 5689 annotation_field_column == "INFO" 5690 and "INFO" in parquet_hdr_vcf_header_columns 5691 ): 5692 sql_query_annotation_update_info_sets.append( 5693 f""" 5694 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 5695 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 5696 ELSE '' 5697 END 5698 """ 5699 ) 5700 # Found in a specific column 5701 else: 5702 sql_query_annotation_update_info_sets.append( 5703 f""" 5704 CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append} 5705 THEN 
concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ',')) 5706 ELSE '' 5707 END 5708 """ 5709 ) 5710 sql_query_annotation_to_agregate.append( 5711 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 5712 ) 5713 5714 # Not to annotate 5715 else: 5716 5717 if force_update_annotation: 5718 annotation_message = "forced" 5719 else: 5720 annotation_message = "skipped" 5721 5722 if annotation_field not in parquet_hdr_vcf_header_infos: 5723 log.warning( 5724 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 5725 ) 5726 if annotation_fields_new_name in self.get_header().infos: 5727 log.warning( 5728 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 5729 ) 5730 5731 # Check if ALL fields have to be annotated. Thus concat all INFO field 5732 # allow_annotation_full_info = True 5733 allow_annotation_full_info = not force_append_annotation 5734 5735 if parquet_type in ["regions"]: 5736 allow_annotation_full_info = False 5737 5738 if ( 5739 allow_annotation_full_info 5740 and nb_annotation_field == len(annotation_fields) 5741 and annotation_fields_all 5742 and ( 5743 "INFO" in parquet_hdr_vcf_header_columns 5744 and "INFO" in database.get_extra_columns() 5745 ) 5746 ): 5747 log.debug("Column INFO annotation enabled") 5748 sql_query_annotation_update_info_sets = [] 5749 sql_query_annotation_update_info_sets.append( 5750 f" table_parquet.INFO " 5751 ) 5752 5753 if sql_query_annotation_update_info_sets: 5754 5755 # Annotate 5756 log.info(f"Annotation '{annotation_name}' - Annotation...") 5757 5758 # Join query annotation update info sets for SQL 5759 sql_query_annotation_update_info_sets_sql = ",".join( 5760 sql_query_annotation_update_info_sets 5761 ) 5762 5763 # Check chromosomes list (and variants 
infos) 5764 sql_query_chromosomes = f""" 5765 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 5766 FROM {table_variants} as table_variants 5767 GROUP BY table_variants."#CHROM" 5768 ORDER BY table_variants."#CHROM" 5769 """ 5770 sql_query_chromosomes_df = self.conn.execute( 5771 sql_query_chromosomes 5772 ).df() 5773 sql_query_chromosomes_dict = { 5774 entry["CHROM"]: { 5775 "count": entry["count_variants"], 5776 "min": entry["min_variants"], 5777 "max": entry["max_variants"], 5778 } 5779 for index, entry in sql_query_chromosomes_df.iterrows() 5780 } 5781 5782 # Init 5783 nb_of_query = 0 5784 nb_of_variant_annotated = 0 5785 query_dict = query_dict_remove 5786 5787 # for chrom in sql_query_chromosomes_df["CHROM"]: 5788 for chrom in sql_query_chromosomes_dict: 5789 5790 # Number of variant by chromosome 5791 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 5792 chrom, {} 5793 ).get("count", 0) 5794 5795 log.debug( 5796 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 
5797 ) 5798 5799 # Annotation with regions database 5800 if parquet_type in ["regions"]: 5801 sql_query_annotation_from_clause = f""" 5802 FROM ( 5803 SELECT 5804 '{chrom}' AS \"#CHROM\", 5805 table_variants_from.\"POS\" AS \"POS\", 5806 {",".join(sql_query_annotation_to_agregate)} 5807 FROM {table_variants} as table_variants_from 5808 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 5809 table_parquet_from."#CHROM" = '{chrom}' 5810 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 5811 AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1) 5812 OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 5813 ) 5814 ) 5815 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 5816 GROUP BY table_variants_from.\"POS\" 5817 ) 5818 as table_parquet 5819 """ 5820 5821 sql_query_annotation_where_clause = """ 5822 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5823 AND table_parquet.\"POS\" = table_variants.\"POS\" 5824 """ 5825 5826 # Annotation with variants database 5827 else: 5828 sql_query_annotation_from_clause = f""" 5829 FROM {parquet_file_link} as table_parquet 5830 """ 5831 sql_query_annotation_where_clause = f""" 5832 table_variants."#CHROM" = '{chrom}' 5833 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5834 AND table_parquet.\"POS\" = table_variants.\"POS\" 5835 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5836 AND table_parquet.\"REF\" = table_variants.\"REF\" 5837 """ 5838 5839 # Create update query 5840 sql_query_annotation_chrom_interval_pos = f""" 5841 UPDATE {table_variants} as table_variants 5842 SET INFO = 5843 concat( 5844 CASE WHEN table_variants.INFO NOT IN ('','.') 5845 THEN table_variants.INFO 5846 ELSE '' 5847 END 5848 , 5849 CASE WHEN table_variants.INFO NOT IN ('','.') 5850 AND ( 5851 concat({sql_query_annotation_update_info_sets_sql}) 5852 ) 5853 NOT IN ('','.') 5854 THEN ';' 5855 ELSE '' 5856 END 5857 , 5858 
{sql_query_annotation_update_info_sets_sql} 5859 ) 5860 {sql_query_annotation_from_clause} 5861 WHERE {sql_query_annotation_where_clause} 5862 ; 5863 """ 5864 5865 # Add update query to dict 5866 query_dict[ 5867 f"{chrom} [{nb_of_variant_by_chrom} variants]" 5868 ] = sql_query_annotation_chrom_interval_pos 5869 5870 nb_of_query = len(query_dict) 5871 num_query = 0 5872 5873 # SET max_expression_depth TO x 5874 self.conn.execute("SET max_expression_depth TO 10000") 5875 5876 for query_name in query_dict: 5877 query = query_dict[query_name] 5878 num_query += 1 5879 log.info( 5880 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 5881 ) 5882 result = self.conn.execute(query) 5883 nb_of_variant_annotated_by_query = result.df()["Count"][0] 5884 nb_of_variant_annotated += nb_of_variant_annotated_by_query 5885 log.info( 5886 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 5887 ) 5888 5889 log.info( 5890 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 5891 ) 5892 5893 else: 5894 5895 log.info( 5896 f"Annotation '{annotation_name}' - No Annotations available" 5897 ) 5898 5899 log.debug("Final header: " + str(vcf_reader.infos)) 5900 5901 # Remove added columns 5902 for added_column in added_columns: 5903 self.drop_column(column=added_column) 5904 5905 def annotation_splice(self, threads: int = None) -> None: 5906 """ 5907 This function annotate with snpEff 5908 5909 :param threads: The number of threads to use 5910 :return: the value of the variable "return_value". 
5911 """ 5912 5913 # DEBUG 5914 log.debug("Start annotation with splice tools") 5915 5916 # Threads 5917 if not threads: 5918 threads = self.get_threads() 5919 log.debug("Threads: " + str(threads)) 5920 5921 # DEBUG 5922 delete_tmp = True 5923 if self.get_config().get("verbosity", "warning") in ["debug"]: 5924 delete_tmp = False 5925 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5926 5927 # Config 5928 config = self.get_config() 5929 log.debug("Config: " + str(config)) 5930 splice_config = config.get("tools", {}).get("splice", {}) 5931 if not splice_config: 5932 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5933 if not splice_config: 5934 msg_err = "No Splice tool config" 5935 log.error(msg_err) 5936 raise ValueError(msg_err) 5937 log.debug(f"splice_config={splice_config}") 5938 5939 # Config - Folders - Databases 5940 databases_folders = ( 5941 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5942 ) 5943 log.debug("Databases annotations: " + str(databases_folders)) 5944 5945 # Splice docker image 5946 splice_docker_image = splice_config.get("docker").get("image") 5947 5948 # Pull splice image if it's not already there 5949 if not check_docker_image_exists(splice_docker_image): 5950 log.warning( 5951 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5952 ) 5953 try: 5954 command(f"docker pull {splice_config.get('docker').get('image')}") 5955 except subprocess.CalledProcessError: 5956 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5957 log.error(msg_err) 5958 raise ValueError(msg_err) 5959 return None 5960 5961 # Config - splice databases 5962 splice_databases = ( 5963 config.get("folders", {}) 5964 .get("databases", {}) 5965 .get("splice", DEFAULT_SPLICE_FOLDER) 5966 ) 5967 splice_databases = full_path(splice_databases) 5968 5969 # Param 5970 param = self.get_param() 5971 log.debug("Param: " + str(param)) 5972 5973 # Param 5974 options = 
param.get("annotation", {}).get("splice", {}) 5975 log.debug("Options: " + str(options)) 5976 5977 # Data 5978 table_variants = self.get_table_variants() 5979 5980 # Check if not empty 5981 log.debug("Check if not empty") 5982 sql_query_chromosomes = ( 5983 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5984 ) 5985 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5986 log.info("VCF empty") 5987 return None 5988 5989 # Export in VCF 5990 log.debug("Create initial file to annotate") 5991 5992 # Create output folder 5993 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 5994 if not os.path.exists(output_folder): 5995 Path(output_folder).mkdir(parents=True, exist_ok=True) 5996 5997 # Create tmp VCF file 5998 tmp_vcf = NamedTemporaryFile( 5999 prefix=self.get_prefix(), 6000 dir=output_folder, 6001 suffix=".vcf", 6002 delete=False, 6003 ) 6004 tmp_vcf_name = tmp_vcf.name 6005 6006 # VCF header 6007 header = self.get_header() 6008 6009 # Existing annotations 6010 for vcf_annotation in self.get_header().infos: 6011 6012 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6013 log.debug( 6014 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6015 ) 6016 6017 # Memory limit 6018 if config.get("memory", None): 6019 memory_limit = config.get("memory", "8G").upper() 6020 # upper() 6021 else: 6022 memory_limit = "8G" 6023 log.debug(f"memory_limit: {memory_limit}") 6024 6025 # Check number of variants to annotate 6026 where_clause_regex_spliceai = r"SpliceAI_\w+" 6027 where_clause_regex_spip = r"SPiP_\w+" 6028 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6029 df_list_of_variants_to_annotate = self.get_query_to_df( 6030 query=f""" SELECT * FROM variants {where_clause} """ 6031 ) 6032 if len(df_list_of_variants_to_annotate) == 0: 6033 log.warning( 6034 f"No variants to 
annotate with splice. Variants probably already annotated with splice" 6035 ) 6036 return None 6037 else: 6038 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6039 6040 # Export VCF file 6041 self.export_variant_vcf( 6042 vcf_file=tmp_vcf_name, 6043 remove_info=True, 6044 add_samples=True, 6045 index=False, 6046 where_clause=where_clause, 6047 ) 6048 6049 # Create docker container and launch splice analysis 6050 if splice_config: 6051 6052 # Splice mount folders 6053 mount_folders = splice_config.get("mount", {}) 6054 6055 # Genome mount 6056 mount_folders[ 6057 config.get("folders", {}) 6058 .get("databases", {}) 6059 .get("genomes", DEFAULT_GENOME_FOLDER) 6060 ] = "ro" 6061 6062 # SpliceAI mount 6063 mount_folders[ 6064 config.get("folders", {}) 6065 .get("databases", {}) 6066 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6067 ] = "ro" 6068 6069 # Genome mount 6070 mount_folders[ 6071 config.get("folders", {}) 6072 .get("databases", {}) 6073 .get("spip", DEFAULT_SPIP_FOLDER) 6074 ] = "ro" 6075 6076 # Mount folders 6077 mount = [] 6078 6079 # Config mount 6080 mount = [ 6081 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6082 for path, mode in mount_folders.items() 6083 ] 6084 6085 if any(value for value in splice_config.values() if value is None): 6086 log.warning("At least one splice config parameter is empty") 6087 return None 6088 6089 # Params in splice nf 6090 def check_values(dico: dict): 6091 """ 6092 Ensure parameters for NF splice pipeline 6093 """ 6094 for key, val in dico.items(): 6095 if key == "genome": 6096 if any( 6097 assemb in options.get("genome", {}) 6098 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6099 ): 6100 yield f"--{key} hg19" 6101 elif any( 6102 assemb in options.get("genome", {}) 6103 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6104 ): 6105 yield f"--{key} hg38" 6106 elif ( 6107 (isinstance(val, str) and val) 6108 or isinstance(val, int) 6109 or isinstance(val, bool) 6110 ): 6111 yield f"--{key} 
{val}" 6112 6113 # Genome 6114 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6115 options["genome"] = genome 6116 6117 # NF params 6118 nf_params = [] 6119 6120 # Add options 6121 if options: 6122 nf_params = list(check_values(options)) 6123 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6124 else: 6125 log.debug("No NF params provided") 6126 6127 # Add threads 6128 if "threads" not in options.keys(): 6129 nf_params.append(f"--threads {threads}") 6130 6131 # Genome path 6132 genome_path = find_genome( 6133 config.get("folders", {}) 6134 .get("databases", {}) 6135 .get("genomes", DEFAULT_GENOME_FOLDER), 6136 file=f"{genome}.fa", 6137 ) 6138 # Add genome path 6139 if not genome_path: 6140 raise ValueError( 6141 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6142 ) 6143 else: 6144 log.debug(f"Genome: {genome_path}") 6145 nf_params.append(f"--genome_path {genome_path}") 6146 6147 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6148 """ 6149 Setting up updated databases for SPiP and SpliceAI 6150 """ 6151 6152 try: 6153 6154 # SpliceAI assembly transcriptome 6155 spliceai_assembly = os.path.join( 6156 config.get("folders", {}) 6157 .get("databases", {}) 6158 .get("spliceai", {}), 6159 options.get("genome"), 6160 "transcriptome", 6161 ) 6162 spip_assembly = options.get("genome") 6163 6164 spip = find( 6165 f"transcriptome_{spip_assembly}.RData", 6166 config.get("folders", {}).get("databases", {}).get("spip", {}), 6167 ) 6168 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6169 log.debug(f"SPiP annotations: {spip}") 6170 log.debug(f"SpliceAI annotations: {spliceai}") 6171 if spip and spliceai: 6172 return [ 6173 f"--spip_transcriptome {spip}", 6174 f"--spliceai_annotations {spliceai}", 6175 ] 6176 else: 6177 # TODO crash and go on with basic annotations ? 
6178 # raise ValueError( 6179 # "Can't find splice databases in configuration EXIT" 6180 # ) 6181 log.warning( 6182 "Can't find splice databases in configuration, use annotations file from image" 6183 ) 6184 except TypeError: 6185 log.warning( 6186 "Can't find splice databases in configuration, use annotations file from image" 6187 ) 6188 return [] 6189 6190 # Add options, check if transcriptome option have already beend provided 6191 if ( 6192 "spip_transcriptome" not in nf_params 6193 and "spliceai_transcriptome" not in nf_params 6194 ): 6195 splice_reference = splice_annotations(options, config) 6196 if splice_reference: 6197 nf_params.extend(splice_reference) 6198 6199 nf_params.append(f"--output_folder {output_folder}") 6200 6201 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6202 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6203 log.debug(cmd) 6204 6205 splice_config["docker"]["command"] = cmd 6206 6207 docker_cmd = get_bin_command( 6208 tool="splice", 6209 bin_type="docker", 6210 config=config, 6211 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6212 add_options=f"--name {random_uuid} {' '.join(mount)}", 6213 ) 6214 6215 # Docker debug 6216 # if splice_config.get("rm_container"): 6217 # rm_container = "--rm" 6218 # else: 6219 # rm_container = "" 6220 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6221 6222 log.debug(docker_cmd) 6223 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6224 log.debug(res.stdout) 6225 if res.stderr: 6226 log.error(res.stderr) 6227 res.check_returncode() 6228 else: 6229 log.warning(f"Splice tool configuration not found: {config}") 6230 
6231 # Update variants 6232 log.info("Annotation - Updating...") 6233 # Test find output vcf 6234 log.debug( 6235 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6236 ) 6237 output_vcf = [] 6238 # Wrong folder to look in 6239 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6240 if ( 6241 files 6242 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6243 ): 6244 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6245 # log.debug(os.listdir(options.get("output_folder"))) 6246 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6247 if not output_vcf: 6248 log.debug( 6249 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6250 ) 6251 else: 6252 # Get new header from annotated vcf 6253 log.debug(f"Initial header: {len(header.infos)} fields") 6254 # Create new header with splice infos 6255 new_vcf = Variants(input=output_vcf[0]) 6256 new_vcf_header = new_vcf.get_header().infos 6257 for keys, infos in new_vcf_header.items(): 6258 if keys not in header.infos.keys(): 6259 header.infos[keys] = infos 6260 log.debug(f"New header: {len(header.infos)} fields") 6261 log.debug(f"Splice tmp output: {output_vcf[0]}") 6262 self.update_from_vcf(output_vcf[0]) 6263 6264 # Remove folder 6265 remove_if_exists(output_folder) 6266 6267 ### 6268 # Prioritization 6269 ### 6270 6271 def get_config_default(self, name: str) -> dict: 6272 """ 6273 The function `get_config_default` returns a dictionary containing default configurations for 6274 various calculations and prioritizations. 6275 6276 :param name: The `get_config_default` function returns a dictionary containing default 6277 configurations for different calculations and prioritizations. 
The `name` parameter is used to 6278 specify which specific configuration to retrieve from the dictionary 6279 :type name: str 6280 :return: The function `get_config_default` returns a dictionary containing default configuration 6281 settings for different calculations and prioritizations. The specific configuration settings are 6282 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6283 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6284 returned. If there is no match, an empty dictionary is returned. 6285 """ 6286 6287 config_default = { 6288 "calculations": { 6289 "variant_chr_pos_alt_ref": { 6290 "type": "sql", 6291 "name": "variant_chr_pos_alt_ref", 6292 "description": "Create a variant ID with chromosome, position, alt and ref", 6293 "available": False, 6294 "output_column_name": "variant_chr_pos_alt_ref", 6295 "output_column_type": "String", 6296 "output_column_description": "variant ID with chromosome, position, alt and ref", 6297 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6298 "operation_info": True, 6299 }, 6300 "VARTYPE": { 6301 "type": "sql", 6302 "name": "VARTYPE", 6303 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6304 "available": True, 6305 "output_column_name": "VARTYPE", 6306 "output_column_type": "String", 6307 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6308 "operation_query": """ 6309 CASE 6310 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6311 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6312 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6313 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6314 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6315 ELSE 'UNDEFINED' 6316 END 6317 """, 6318 "info_fields": ["SVTYPE"], 6319 "operation_info": True, 6320 }, 6321 "snpeff_hgvs": { 6322 "type": "python", 6323 "name": "snpeff_hgvs", 6324 "description": "HGVS nomenclatures from snpEff annotation", 6325 "available": True, 6326 "function_name": "calculation_extract_snpeff_hgvs", 6327 "function_params": ["snpeff_hgvs", "ANN"], 6328 }, 6329 "snpeff_ann_explode": { 6330 "type": "python", 6331 "name": "snpeff_ann_explode", 6332 "description": "Explode snpEff annotations with uniquify values", 6333 "available": True, 6334 "function_name": "calculation_snpeff_ann_explode", 6335 "function_params": [False, "fields", "snpeff_", "ANN"], 6336 }, 6337 "snpeff_ann_explode_uniquify": { 6338 "type": "python", 6339 "name": "snpeff_ann_explode_uniquify", 6340 "description": "Explode snpEff annotations", 6341 "available": True, 6342 "function_name": "calculation_snpeff_ann_explode", 6343 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6344 }, 6345 "snpeff_ann_explode_json": { 6346 "type": "python", 6347 "name": "snpeff_ann_explode_json", 6348 "description": "Explode snpEff annotations in JSON format", 6349 "available": True, 6350 "function_name": "calculation_snpeff_ann_explode", 6351 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6352 }, 6353 "NOMEN": { 6354 "type": "python", 6355 "name": "NOMEN", 6356 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6357 "available": True, 6358 "function_name": "calculation_extract_nomen", 6359 "function_params": [], 6360 }, 6361 "FINDBYPIPELINE": { 6362 "type": "python", 6363 "name": "FINDBYPIPELINE", 6364 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6365 "available": True, 6366 "function_name": "calculation_find_by_pipeline", 6367 "function_params": ["findbypipeline"], 6368 }, 6369 "FINDBYSAMPLE": { 6370 "type": "python", 6371 "name": "FINDBYSAMPLE", 6372 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6373 "available": True, 6374 "function_name": "calculation_find_by_pipeline", 6375 "function_params": ["findbysample"], 6376 }, 6377 "GENOTYPECONCORDANCE": { 6378 "type": "python", 6379 "name": "GENOTYPECONCORDANCE", 6380 "description": "Concordance of genotype for multi caller VCF", 6381 "available": True, 6382 "function_name": "calculation_genotype_concordance", 6383 "function_params": [], 6384 }, 6385 "BARCODE": { 6386 "type": "python", 6387 "name": "BARCODE", 6388 "description": "BARCODE as VaRank tool", 6389 "available": True, 6390 "function_name": "calculation_barcode", 6391 "function_params": [], 6392 }, 6393 "BARCODEFAMILY": { 6394 "type": "python", 6395 "name": "BARCODEFAMILY", 6396 "description": "BARCODEFAMILY as VaRank tool", 6397 "available": True, 6398 "function_name": "calculation_barcode_family", 6399 "function_params": ["BCF"], 6400 }, 6401 "TRIO": { 6402 "type": "python", 6403 "name": "TRIO", 6404 "description": "Inheritance for a trio family", 6405 "available": True, 6406 "function_name": "calculation_trio", 6407 "function_params": [], 6408 }, 6409 "VAF": { 6410 "type": "python", 6411 "name": "VAF", 6412 "description": "Variant Allele Frequency (VAF) harmonization", 6413 "available": True, 6414 "function_name": "calculation_vaf_normalization", 6415 "function_params": [], 6416 }, 6417 "VAF_stats": { 6418 "type": "python", 6419 "name": 
"VAF_stats", 6420 "description": "Variant Allele Frequency (VAF) statistics", 6421 "available": True, 6422 "function_name": "calculation_genotype_stats", 6423 "function_params": ["VAF"], 6424 }, 6425 "DP_stats": { 6426 "type": "python", 6427 "name": "DP_stats", 6428 "description": "Depth (DP) statistics", 6429 "available": True, 6430 "function_name": "calculation_genotype_stats", 6431 "function_params": ["DP"], 6432 }, 6433 "variant_id": { 6434 "type": "python", 6435 "name": "variant_id", 6436 "description": "Variant ID generated from variant position and type", 6437 "available": True, 6438 "function_name": "calculation_variant_id", 6439 "function_params": [], 6440 }, 6441 "transcripts_json": { 6442 "type": "python", 6443 "name": "transcripts_json", 6444 "description": "Add transcripts info in JSON format (field 'transcripts_json')", 6445 "available": True, 6446 "function_name": "calculation_transcripts_json", 6447 "function_params": ["transcripts_json"], 6448 }, 6449 }, 6450 "prioritizations": { 6451 "default": { 6452 "filter": [ 6453 { 6454 "type": "notequals", 6455 "value": "!PASS|\\.", 6456 "score": 0, 6457 "flag": "FILTERED", 6458 "comment": ["Bad variant quality"], 6459 }, 6460 { 6461 "type": "equals", 6462 "value": "REJECT", 6463 "score": -20, 6464 "flag": "PASS", 6465 "comment": ["Bad variant quality"], 6466 }, 6467 ], 6468 "DP": [ 6469 { 6470 "type": "gte", 6471 "value": "50", 6472 "score": 5, 6473 "flag": "PASS", 6474 "comment": ["DP higher than 50"], 6475 } 6476 ], 6477 "ANN": [ 6478 { 6479 "type": "contains", 6480 "value": "HIGH", 6481 "score": 5, 6482 "flag": "PASS", 6483 "comment": [ 6484 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6485 ], 6486 }, 6487 { 6488 "type": "contains", 6489 "value": "MODERATE", 6490 "score": 3, 6491 "flag": "PASS", 6492 "comment": [ 6493 "A non-disruptive variant that might change protein effectiveness" 
6494 ], 6495 }, 6496 { 6497 "type": "contains", 6498 "value": "LOW", 6499 "score": 0, 6500 "flag": "FILTERED", 6501 "comment": [ 6502 "Assumed to be mostly harmless or unlikely to change protein behavior" 6503 ], 6504 }, 6505 { 6506 "type": "contains", 6507 "value": "MODIFIER", 6508 "score": 0, 6509 "flag": "FILTERED", 6510 "comment": [ 6511 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6512 ], 6513 }, 6514 ], 6515 } 6516 }, 6517 } 6518 6519 return config_default.get(name, None) 6520 6521 def get_config_json( 6522 self, name: str, config_dict: dict = {}, config_file: str = None 6523 ) -> dict: 6524 """ 6525 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6526 default values, a dictionary, and a file. 6527 6528 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6529 the name of the configuration. It is used to identify and retrieve the configuration settings 6530 for a specific component or module 6531 :type name: str 6532 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6533 dictionary that allows you to provide additional configuration settings or overrides. When you 6534 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6535 the key is the configuration setting you want to override or 6536 :type config_dict: dict 6537 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6538 specify the path to a configuration file that contains additional settings. 
If provided, the 6539 function will read the contents of this file and update the configuration dictionary with the 6540 values found in the file, overriding any existing values with the 6541 :type config_file: str 6542 :return: The function `get_config_json` returns a dictionary containing the configuration 6543 settings. 6544 """ 6545 6546 # Create with default prioritizations 6547 config_default = self.get_config_default(name=name) 6548 configuration = config_default 6549 # log.debug(f"configuration={configuration}") 6550 6551 # Replace prioritizations from dict 6552 for config in config_dict: 6553 configuration[config] = config_dict[config] 6554 6555 # Replace prioritizations from file 6556 config_file = full_path(config_file) 6557 if config_file: 6558 if os.path.exists(config_file): 6559 with open(config_file) as config_file_content: 6560 config_file_dict = json.load(config_file_content) 6561 for config in config_file_dict: 6562 configuration[config] = config_file_dict[config] 6563 else: 6564 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6565 log.error(msg_error) 6566 raise ValueError(msg_error) 6567 6568 return configuration 6569 6570 def prioritization(self) -> None: 6571 """ 6572 It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other 6573 INFO fields 6574 """ 6575 6576 # Config 6577 config = self.get_config() 6578 6579 # Param 6580 param = self.get_param() 6581 6582 # Quick Prioritizations 6583 # prioritizations = param.get("prioritization", {}).get("prioritizations", "") 6584 6585 # Configuration profiles 6586 prioritization_config_file = param.get("prioritization", {}).get( 6587 "prioritization_config", None 6588 ) 6589 prioritization_config_file = full_path(prioritization_config_file) 6590 prioritizations_config = self.get_config_json( 6591 name="prioritizations", config_file=prioritization_config_file 6592 ) 6593 6594 # Prioritization options 6595 profiles = param.get("prioritization", 
    def prioritization(self) -> None:
        """
        It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other
        INFO fields.

        For each requested prioritization profile, this method:
        - adds PZ columns (PZScore/PZFlag/PZComment/PZInfos, optionally PZTags) to the
          variants table,
        - applies each profile criterion as an SQL UPDATE accumulating score/flag/comment,
        - folds the resulting PZ columns back into the INFO field,
        - finally drops the temporary columns it added.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Quick Prioritizations
        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")

        # Configuration profiles: defaults merged with an optional JSON config file
        prioritization_config_file = param.get("prioritization", {}).get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization options (comma-separated strings are accepted and split)
        profiles = param.get("prioritization", {}).get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = param.get("prioritization", {}).get(
            "pzfields", ["PZFlag", "PZScore"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = param.get("prioritization", {}).get("default_profile", None)
        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
        prioritization_score_mode = param.get("prioritization", {}).get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations (top-level param shortcut, comma-separated)
        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                    log.info(f" {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Fail fast if any requested profile has no configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return

        # Default profile falls back to the first requested profile
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Added columns (temporary table columns, dropped at the end of this method)
        added_columns = []

        # Create list of PZfields: base names plus one suffixed name per profile
        # (e.g. "PZScore" and "PZScore_default")
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists: only compute fields not already
        # present in the input VCF header
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos fields so annotations are queryable as table columns
            explode_infos_prefix = self.get_explode_infos_prefix()
            added_columns += self.explode_infos(prefix=explode_infos_prefix)
            extra_infos = self.get_extra_infos()

            # PZfields tags description (VCF INFO header metadata for each PZ field)
            PZfields_INFOS = {
                "PZTags": {
                    "ID": "PZTags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                "PZScore": {
                    "ID": "PZScore",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                "PZFlag": {
                    "ID": "PZFlag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                "PZComment": {
                    "ID": "PZComment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                "PZInfos": {
                    "ID": "PZInfos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (un-suffixed fields for the default profile)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile (suffixed fields,
            # e.g. "PZScore_default"); note membership is tested on the base name `field`
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Header: add one working column per PZfield, typed by field family
            # (score -> INTEGER 0, flag -> BOOLEAN true, others -> STRING '')
            for pzfield in list_of_pzfields:
                if re.match("PZScore.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match("PZFlag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        sql_set_info = []

                        # PZ fields set: build SQL snippets that serialize each PZ column
                        # back into 'key=value' INFO entries; the default profile also
                        # writes the un-suffixed key (e.g. 'PZScore=')

                        # PZScore
                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZScore{pzfields_sep}{profile}=',
                                    PZScore{pzfields_sep}{profile}
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and "PZScore" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZScore=',
                                    PZScore{pzfields_sep}{profile}
                                )
                                """
                            )

                        # PZFlag (boolean column rendered as PASS/FILTERED)
                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZFlag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN PZFlag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN PZFlag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and "PZFlag" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZFlag=',
                                    CASE
                                        WHEN PZFlag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN PZFlag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )

                        # PZComment (emitted only when non-empty)
                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and "PZComment" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # PZInfos (emitted only when non-empty)
                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and "PZInfos" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # Merge PZfields: join the snippets with ';' separators into one
                        # concat() argument list
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # Check if annotation field is present in the exploded data
                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
                                log.debug(f"Annotation '{annotation}' not in data")
                                continue
                            else:
                                log.debug(f"Annotation '{annotation}' in data")

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:
                                criterion_type = criterion["type"]
                                criterion_value = criterion["value"]
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Sanitize free text for safe embedding in SQL literals
                                # and in the ';'-separated INFO field
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                sql_set = []
                                sql_set_info = []

                                # PZ fields set: per-criterion SET clauses
                                if (
                                    f"PZScore{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # HOWARD mode accumulates scores; VaRank mode keeps
                                    # the maximum score; anything else falls back to
                                    # accumulation
                                    if prioritization_score_mode == "HOWARD":
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                    elif prioritization_score_mode == "VaRank":
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    else:
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
                                    # Flag is AND-ed: a single FILTERED criterion filters
                                    # the variant
                                    sql_set.append(
                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )
                                if (
                                    f"PZComment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        PZComment{pzfields_sep}{profile} =
                                            concat(
                                                PZComment{pzfields_sep}{profile},
                                                CASE
                                                    WHEN PZComment{pzfields_sep}{profile}!=''
                                                    THEN ', '
                                                    ELSE ''
                                                END,
                                                '{criterion_comment}'
                                            )
                                        """
                                    )
                                if (
                                    f"PZInfos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        PZInfos{pzfields_sep}{profile} =
                                            concat(
                                                PZInfos{pzfields_sep}{profile},
                                                '{criterion_infos}'
                                            )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison: numeric values use the
                                # comparison operator mapped from criterion_type;
                                # non-numeric values fall back to pattern matching
                                # ('contains' wraps the value with '.*')
                                try:
                                    float(criterion_value)
                                    sql_update = f"""
                                    UPDATE {table_variants}
                                    SET {sql_set_option}
                                    WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                    AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
                                    """
                                except:
                                    contains_option = ""
                                    if criterion_type == "contains":
                                        contains_option = ".*"
                                    sql_update = f"""
                                    UPDATE {table_variants}
                                    SET {sql_set_option}
                                    WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                    """
                                sql_queries.append(sql_update)

                        # PZTags
                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:

                            # Create PZTags value: 'field#value' entries joined with '|'
                            pztags_value = ""
                            pztags_sep_default = "|"
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in ["PZTags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in ["PZFlag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                            CASE WHEN PZFlag{pzfields_sep}{profile}
                                            THEN 'PASS'
                                            ELSE 'FILTERED'
                                            END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZTags
                            sql_update_pztags = f"""
                            UPDATE {table_variants}
                            SET INFO = concat(
                                INFO,
                                CASE WHEN INFO NOT in ('','.')
                                    THEN ';'
                                    ELSE ''
                                END,
                                'PZTags{pzfields_sep}{profile}={pztags_value}'
                            )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZTags for default profile
                            # (un-suffixed 'PZTags=' key)
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    ';',
                                    'PZTags={pztags_value}'
                                )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        # NOTE(review): the final INFO update below runs only when at
                        # least one criterion query was generated for this profile —
                        # confirm this is the intended behavior for profiles whose
                        # annotations are all absent from the data
                        if sql_queries:

                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                            log.info(f"""Profile '{profile}' - Update... """)
                            # Fold the computed PZ columns back into the INFO field
                            sql_query_update = f"""
                            UPDATE {table_variants}
                            SET INFO =
                                concat(
                                    CASE
                                        WHEN INFO NOT IN ('','.')
                                        THEN concat(INFO, ';')
                                        ELSE ''
                                    END
                                    {sql_set_info_option}
                                )
                            """
                            self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns (temporary working columns)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return
7137 """ 7138 return partition.apply(annotation_hgvs_partition, axis=1) 7139 7140 def annotation_hgvs_partition(row) -> str: 7141 """ 7142 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7143 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7144 7145 :param row: A dictionary-like object that contains the values for the following keys: 7146 :return: a string that contains the HGVS names associated with the given row of data. 7147 """ 7148 7149 chr = row["CHROM"] 7150 pos = row["POS"] 7151 ref = row["REF"] 7152 alt = row["ALT"] 7153 7154 # Find list of associated transcripts 7155 transcripts_list = list( 7156 polars_conn.execute( 7157 f""" 7158 SELECT transcript 7159 FROM refseq_df 7160 WHERE CHROM='{chr}' 7161 AND POS={pos} 7162 """ 7163 )["transcript"] 7164 ) 7165 7166 # Full HGVS annotation in list 7167 hgvs_full_list = [] 7168 7169 for transcript_name in transcripts_list: 7170 7171 # Transcript 7172 transcript = get_transcript( 7173 transcripts=transcripts, transcript_name=transcript_name 7174 ) 7175 # Exon 7176 if use_exon: 7177 exon = transcript.find_exon_number(pos) 7178 else: 7179 exon = None 7180 # Protein 7181 transcript_protein = None 7182 if use_protein or add_protein or full_format: 7183 transcripts_protein = list( 7184 polars_conn.execute( 7185 f""" 7186 SELECT protein 7187 FROM refseqlink_df 7188 WHERE transcript='{transcript_name}' 7189 LIMIT 1 7190 """ 7191 )["protein"] 7192 ) 7193 if len(transcripts_protein): 7194 transcript_protein = transcripts_protein[0] 7195 7196 # HGVS name 7197 hgvs_name = format_hgvs_name( 7198 chr, 7199 pos, 7200 ref, 7201 alt, 7202 genome=genome, 7203 transcript=transcript, 7204 transcript_protein=transcript_protein, 7205 exon=exon, 7206 use_gene=use_gene, 7207 use_protein=use_protein, 7208 full_format=full_format, 7209 use_version=use_version, 7210 codon_type=codon_type, 7211 ) 7212 hgvs_full_list.append(hgvs_name) 7213 if add_protein and not 
use_protein and not full_format: 7214 hgvs_name = format_hgvs_name( 7215 chr, 7216 pos, 7217 ref, 7218 alt, 7219 genome=genome, 7220 transcript=transcript, 7221 transcript_protein=transcript_protein, 7222 exon=exon, 7223 use_gene=use_gene, 7224 use_protein=True, 7225 full_format=False, 7226 use_version=use_version, 7227 codon_type=codon_type, 7228 ) 7229 hgvs_full_list.append(hgvs_name) 7230 7231 # Create liste of HGVS annotations 7232 hgvs_full = ",".join(hgvs_full_list) 7233 7234 return hgvs_full 7235 7236 # Polars connexion 7237 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7238 7239 # Config 7240 config = self.get_config() 7241 7242 # Databases 7243 # Genome 7244 databases_genomes_folders = ( 7245 config.get("folders", {}) 7246 .get("databases", {}) 7247 .get("genomes", DEFAULT_GENOME_FOLDER) 7248 ) 7249 databases_genome = ( 7250 config.get("folders", {}).get("databases", {}).get("genomes", "") 7251 ) 7252 # refseq database folder 7253 databases_refseq_folders = ( 7254 config.get("folders", {}) 7255 .get("databases", {}) 7256 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7257 ) 7258 # refseq 7259 databases_refseq = config.get("databases", {}).get("refSeq", None) 7260 # refSeqLink 7261 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7262 7263 # Param 7264 param = self.get_param() 7265 7266 # Quick HGVS 7267 if "hgvs_options" in param and param.get("hgvs_options", ""): 7268 log.info(f"Quick HGVS Annotation:") 7269 if not param.get("hgvs", None): 7270 param["hgvs"] = {} 7271 for option in param.get("hgvs_options", "").split(","): 7272 option_var_val = option.split("=") 7273 option_var = option_var_val[0] 7274 if len(option_var_val) > 1: 7275 option_val = option_var_val[1] 7276 else: 7277 option_val = "True" 7278 if option_val.upper() in ["TRUE"]: 7279 option_val = True 7280 elif option_val.upper() in ["FALSE"]: 7281 option_val = False 7282 log.info(f" {option_var}={option_val}") 7283 param["hgvs"][option_var] = option_val 7284 
7285 # Check if HGVS annotation enabled 7286 if "hgvs" in param: 7287 log.info(f"HGVS Annotation... ") 7288 for hgvs_option in param.get("hgvs", {}): 7289 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7290 else: 7291 return 7292 7293 # HGVS Param 7294 param_hgvs = param.get("hgvs", {}) 7295 use_exon = param_hgvs.get("use_exon", False) 7296 use_gene = param_hgvs.get("use_gene", False) 7297 use_protein = param_hgvs.get("use_protein", False) 7298 add_protein = param_hgvs.get("add_protein", False) 7299 full_format = param_hgvs.get("full_format", False) 7300 use_version = param_hgvs.get("use_version", False) 7301 codon_type = param_hgvs.get("codon_type", "3") 7302 7303 # refSseq refSeqLink 7304 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7305 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7306 7307 # Assembly 7308 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7309 7310 # Genome 7311 genome_file = None 7312 if find_genome(databases_genome): 7313 genome_file = find_genome(databases_genome) 7314 else: 7315 genome_file = find_genome( 7316 genome_path=databases_genomes_folders, assembly=assembly 7317 ) 7318 log.debug("Genome: " + str(genome_file)) 7319 7320 # refSseq 7321 refseq_file = find_file_prefix( 7322 input_file=databases_refseq, 7323 prefix="ncbiRefSeq", 7324 folder=databases_refseq_folders, 7325 assembly=assembly, 7326 ) 7327 log.debug("refSeq: " + str(refseq_file)) 7328 7329 # refSeqLink 7330 refseqlink_file = find_file_prefix( 7331 input_file=databases_refseqlink, 7332 prefix="ncbiRefSeqLink", 7333 folder=databases_refseq_folders, 7334 assembly=assembly, 7335 ) 7336 log.debug("refSeqLink: " + str(refseqlink_file)) 7337 7338 # Threads 7339 if not threads: 7340 threads = self.get_threads() 7341 log.debug("Threads: " + str(threads)) 7342 7343 # Variables 7344 table_variants = self.get_table_variants(clause="update") 7345 7346 # Get variants SNV and InDel only 7347 
query_variants = f""" 7348 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7349 FROM {table_variants} 7350 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7351 """ 7352 df_variants = self.get_query_to_df(query_variants) 7353 7354 # Added columns 7355 added_columns = [] 7356 7357 # Add hgvs column in variants table 7358 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7359 added_column = self.add_column( 7360 table_variants, hgvs_column_name, "STRING", default_value=None 7361 ) 7362 added_columns.append(added_column) 7363 7364 log.debug(f"refSeq loading...") 7365 # refSeq in duckDB 7366 refseq_table = get_refseq_table( 7367 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7368 ) 7369 # Loading all refSeq in Dataframe 7370 refseq_query = f""" 7371 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7372 FROM {refseq_table} 7373 JOIN df_variants ON ( 7374 {refseq_table}.chrom = df_variants.CHROM 7375 AND {refseq_table}.txStart<=df_variants.POS 7376 AND {refseq_table}.txEnd>=df_variants.POS 7377 ) 7378 """ 7379 refseq_df = self.conn.query(refseq_query).pl() 7380 7381 if refseqlink_file: 7382 log.debug(f"refSeqLink loading...") 7383 # refSeqLink in duckDB 7384 refseqlink_table = get_refseq_table( 7385 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7386 ) 7387 # Loading all refSeqLink in Dataframe 7388 protacc_column = "protAcc_with_ver" 7389 mrnaacc_column = "mrnaAcc_with_ver" 7390 refseqlink_query = f""" 7391 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7392 FROM {refseqlink_table} 7393 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7394 WHERE protAcc_without_ver IS NOT NULL 7395 """ 7396 # Polars Dataframe 7397 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7398 7399 # Read RefSeq transcripts into a python dict/model. 
7400 log.debug(f"Transcripts loading...") 7401 with tempfile.TemporaryDirectory() as tmpdir: 7402 transcripts_query = f""" 7403 COPY ( 7404 SELECT {refseq_table}.* 7405 FROM {refseq_table} 7406 JOIN df_variants ON ( 7407 {refseq_table}.chrom=df_variants.CHROM 7408 AND {refseq_table}.txStart<=df_variants.POS 7409 AND {refseq_table}.txEnd>=df_variants.POS 7410 ) 7411 ) 7412 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7413 """ 7414 self.conn.query(transcripts_query) 7415 with open(f"{tmpdir}/transcript.tsv") as infile: 7416 transcripts = read_transcripts(infile) 7417 7418 # Polars connexion 7419 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7420 7421 log.debug("Genome loading...") 7422 # Read genome sequence using pyfaidx. 7423 genome = Fasta(genome_file) 7424 7425 log.debug("Start annotation HGVS...") 7426 7427 # Create 7428 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7429 ddf = dd.from_pandas(df_variants, npartitions=threads) 7430 7431 # Use dask.dataframe.apply() to apply function on each partition 7432 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7433 7434 # Convert Dask DataFrame to Pandas Dataframe 7435 df = ddf.compute() 7436 7437 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
7438 with tempfile.TemporaryDirectory() as tmpdir: 7439 df_parquet = os.path.join(tmpdir, "df.parquet") 7440 df.to_parquet(df_parquet) 7441 7442 # Update hgvs column 7443 update_variant_query = f""" 7444 UPDATE {table_variants} 7445 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 7446 FROM read_parquet('{df_parquet}') as df 7447 WHERE variants."#CHROM" = df.CHROM 7448 AND variants.POS = df.POS 7449 AND variants.REF = df.REF 7450 AND variants.ALT = df.ALT 7451 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 7452 """ 7453 self.execute_query(update_variant_query) 7454 7455 # Update INFO column 7456 sql_query_update = f""" 7457 UPDATE {table_variants} 7458 SET INFO = 7459 concat( 7460 CASE 7461 WHEN INFO NOT IN ('','.') 7462 THEN concat(INFO, ';') 7463 ELSE '' 7464 END, 7465 'hgvs=', 7466 {hgvs_column_name} 7467 ) 7468 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 7469 """ 7470 self.execute_query(sql_query_update) 7471 7472 # Add header 7473 HGVS_INFOS = { 7474 "hgvs": { 7475 "ID": "hgvs", 7476 "Number": ".", 7477 "Type": "String", 7478 "Description": f"HGVS annotatation with HOWARD", 7479 } 7480 } 7481 7482 for field in HGVS_INFOS: 7483 field_ID = HGVS_INFOS[field]["ID"] 7484 field_description = HGVS_INFOS[field]["Description"] 7485 self.get_header().infos[field_ID] = vcf.parser._Info( 7486 field_ID, 7487 HGVS_INFOS[field]["Number"], 7488 HGVS_INFOS[field]["Type"], 7489 field_description, 7490 "unknown", 7491 "unknown", 7492 code_type_map[HGVS_INFOS[field]["Type"]], 7493 ) 7494 7495 # Remove added columns 7496 for added_column in added_columns: 7497 self.drop_column(column=added_column) 7498 7499 ### 7500 # Calculation 7501 ### 7502 7503 def get_operations_help( 7504 self, operations_config_dict: dict = {}, operations_config_file: str = None 7505 ) -> list: 7506 7507 # Init 7508 operations_help = [] 7509 7510 # operations 7511 operations = self.get_config_json( 7512 name="calculations", 7513 
config_dict=operations_config_dict, 7514 config_file=operations_config_file, 7515 ) 7516 for op in operations: 7517 op_name = operations[op].get("name", op).upper() 7518 op_description = operations[op].get("description", op_name) 7519 op_available = operations[op].get("available", False) 7520 if op_available: 7521 operations_help.append(f" {op_name}: {op_description}") 7522 7523 # Sort operations 7524 operations_help.sort() 7525 7526 # insert header 7527 operations_help.insert(0, "Available calculation operations:") 7528 7529 # Return 7530 return operations_help 7531 7532 def calculation( 7533 self, 7534 operations: dict = {}, 7535 operations_config_dict: dict = {}, 7536 operations_config_file: str = None, 7537 ) -> None: 7538 """ 7539 It takes a list of operations, and for each operation, it checks if it's a python or sql 7540 operation, and then calls the appropriate function 7541 7542 param json example: 7543 "calculation": { 7544 "NOMEN": { 7545 "options": { 7546 "hgvs_field": "hgvs" 7547 }, 7548 "middle" : null 7549 } 7550 """ 7551 7552 # Param 7553 param = self.get_param() 7554 7555 # operations config 7556 operations_config = self.get_config_json( 7557 name="calculations", 7558 config_dict=operations_config_dict, 7559 config_file=operations_config_file, 7560 ) 7561 7562 # Upper keys 7563 operations_config = {k.upper(): v for k, v in operations_config.items()} 7564 7565 # Calculations 7566 7567 # Operations from param 7568 operations = param.get("calculation", {}).get("calculations", operations) 7569 7570 # Quick calculation - add 7571 if param.get("calculations", None): 7572 calculations_list = [ 7573 value for value in param.get("calculations", "").split(",") 7574 ] 7575 log.info(f"Quick Calculations:") 7576 for calculation_key in calculations_list: 7577 log.info(f" {calculation_key}") 7578 for calculation_operation in calculations_list: 7579 if calculation_operation.upper() not in operations: 7580 operations[calculation_operation.upper()] = {} 7581 
add_value_into_dict( 7582 dict_tree=param, 7583 sections=[ 7584 "calculation", 7585 "calculations", 7586 calculation_operation.upper(), 7587 ], 7588 value={}, 7589 ) 7590 7591 # Operations for calculation 7592 if not operations: 7593 operations = param.get("calculation", {}).get("calculations", {}) 7594 7595 if operations: 7596 log.info(f"Calculations...") 7597 7598 # For each operations 7599 for operation_name in operations: 7600 operation_name = operation_name.upper() 7601 if operation_name not in [""]: 7602 if operation_name in operations_config: 7603 log.info(f"Calculation '{operation_name}'") 7604 operation = operations_config[operation_name] 7605 operation_type = operation.get("type", "sql") 7606 if operation_type == "python": 7607 self.calculation_process_function( 7608 operation=operation, operation_name=operation_name 7609 ) 7610 elif operation_type == "sql": 7611 self.calculation_process_sql( 7612 operation=operation, operation_name=operation_name 7613 ) 7614 else: 7615 log.error( 7616 f"Operations config: Type '{operation_type}' NOT available" 7617 ) 7618 raise ValueError( 7619 f"Operations config: Type '{operation_type}' NOT available" 7620 ) 7621 else: 7622 log.error( 7623 f"Operations config: Calculation '{operation_name}' NOT available" 7624 ) 7625 raise ValueError( 7626 f"Operations config: Calculation '{operation_name}' NOT available" 7627 ) 7628 7629 # Explode INFOS fields into table fields 7630 if self.get_explode_infos(): 7631 self.explode_infos( 7632 prefix=self.get_explode_infos_prefix(), 7633 fields=self.get_explode_infos_fields(), 7634 force=True, 7635 ) 7636 7637 def calculation_process_sql( 7638 self, operation: dict, operation_name: str = "unknown" 7639 ) -> None: 7640 """ 7641 The `calculation_process_sql` function takes in a mathematical operation as a string and 7642 performs the operation, updating the specified table with the result. 
7643 7644 :param operation: The `operation` parameter is a dictionary that contains information about the 7645 mathematical operation to be performed. It includes the following keys: 7646 :type operation: dict 7647 :param operation_name: The `operation_name` parameter is a string that represents the name of 7648 the mathematical operation being performed. It is used for logging and error handling purposes, 7649 defaults to unknown 7650 :type operation_name: str (optional) 7651 """ 7652 7653 # table variants 7654 table_variants = self.get_table_variants(clause="alter") 7655 7656 # Operation infos 7657 operation_name = operation.get("name", "unknown") 7658 log.debug(f"process sql {operation_name}") 7659 output_column_name = operation.get("output_column_name", operation_name) 7660 output_column_type = operation.get("output_column_type", "String") 7661 prefix = operation.get("explode_infos_prefix", "") 7662 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7663 output_column_description = operation.get( 7664 "output_column_description", f"{operation_name} operation" 7665 ) 7666 operation_query = operation.get("operation_query", None) 7667 if isinstance(operation_query, list): 7668 operation_query = " ".join(operation_query) 7669 operation_info_fields = operation.get("info_fields", []) 7670 operation_info_fields_check = operation.get("info_fields_check", False) 7671 operation_info = operation.get("operation_info", True) 7672 7673 if operation_query: 7674 7675 # Info fields check 7676 operation_info_fields_check_result = True 7677 if operation_info_fields_check: 7678 header_infos = self.get_header().infos 7679 for info_field in operation_info_fields: 7680 operation_info_fields_check_result = ( 7681 operation_info_fields_check_result 7682 and info_field in header_infos 7683 ) 7684 7685 # If info fields available 7686 if operation_info_fields_check_result: 7687 7688 # Added_columns 7689 added_columns = [] 7690 7691 # Create VCF header field 
7692 vcf_reader = self.get_header() 7693 vcf_reader.infos[output_column_name] = vcf.parser._Info( 7694 output_column_name, 7695 ".", 7696 output_column_type, 7697 output_column_description, 7698 "howard calculation", 7699 "0", 7700 self.code_type_map.get(output_column_type), 7701 ) 7702 7703 # Explode infos if needed 7704 log.debug(f"calculation_process_sql prefix {prefix}") 7705 added_columns += self.explode_infos( 7706 prefix=prefix, 7707 fields=[output_column_name] + operation_info_fields, 7708 force=True, 7709 ) 7710 7711 # Create column 7712 added_column = self.add_column( 7713 table_name=table_variants, 7714 column_name=prefix + output_column_name, 7715 column_type=output_column_type_sql, 7716 default_value="null", 7717 ) 7718 added_columns.append(added_column) 7719 7720 # Operation calculation 7721 try: 7722 7723 # Query to update calculation column 7724 sql_update = f""" 7725 UPDATE {table_variants} 7726 SET "{prefix}{output_column_name}" = ({operation_query}) 7727 """ 7728 self.conn.execute(sql_update) 7729 7730 # Add to INFO 7731 if operation_info: 7732 sql_update_info = f""" 7733 UPDATE {table_variants} 7734 SET "INFO" = 7735 concat( 7736 CASE 7737 WHEN "INFO" IS NOT NULL 7738 THEN concat("INFO", ';') 7739 ELSE '' 7740 END, 7741 '{output_column_name}=', 7742 "{prefix}{output_column_name}" 7743 ) 7744 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 7745 """ 7746 self.conn.execute(sql_update_info) 7747 7748 except: 7749 log.error( 7750 f"Operations config: Calculation '{operation_name}' query failed" 7751 ) 7752 raise ValueError( 7753 f"Operations config: Calculation '{operation_name}' query failed" 7754 ) 7755 7756 # Remove added columns 7757 for added_column in added_columns: 7758 log.debug(f"added_column: {added_column}") 7759 self.drop_column(column=added_column) 7760 7761 else: 7762 log.error( 7763 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 7764 ) 7765 raise ValueError( 7766 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7767 ) 7768 7769 else: 7770 log.error( 7771 f"Operations config: Calculation '{operation_name}' query NOT defined" 7772 ) 7773 raise ValueError( 7774 f"Operations config: Calculation '{operation_name}' query NOT defined" 7775 ) 7776 7777 def calculation_process_function( 7778 self, operation: dict, operation_name: str = "unknown" 7779 ) -> None: 7780 """ 7781 The `calculation_process_function` takes in an operation dictionary and performs the specified 7782 function with the given parameters. 7783 7784 :param operation: The `operation` parameter is a dictionary that contains information about the 7785 operation to be performed. It has the following keys: 7786 :type operation: dict 7787 :param operation_name: The `operation_name` parameter is a string that represents the name of 7788 the operation being performed. It is used for logging purposes, defaults to unknown 7789 :type operation_name: str (optional) 7790 """ 7791 7792 operation_name = operation["name"] 7793 log.debug(f"process sql {operation_name}") 7794 function_name = operation["function_name"] 7795 function_params = operation["function_params"] 7796 getattr(self, function_name)(*function_params) 7797 7798 def calculation_variant_id(self) -> None: 7799 """ 7800 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 7801 updates the INFO field of a variants table with the variant ID. 
7802 """ 7803 7804 # variant_id annotation field 7805 variant_id_tag = self.get_variant_id_column() 7806 added_columns = [variant_id_tag] 7807 7808 # variant_id hgvs tags" 7809 vcf_infos_tags = { 7810 variant_id_tag: "howard variant ID annotation", 7811 } 7812 7813 # Variants table 7814 table_variants = self.get_table_variants() 7815 7816 # Header 7817 vcf_reader = self.get_header() 7818 7819 # Add variant_id to header 7820 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 7821 variant_id_tag, 7822 ".", 7823 "String", 7824 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 7825 "howard calculation", 7826 "0", 7827 self.code_type_map.get("String"), 7828 ) 7829 7830 # Update 7831 sql_update = f""" 7832 UPDATE {table_variants} 7833 SET "INFO" = 7834 concat( 7835 CASE 7836 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7837 THEN '' 7838 ELSE concat("INFO", ';') 7839 END, 7840 '{variant_id_tag}=', 7841 "{variant_id_tag}" 7842 ) 7843 """ 7844 self.conn.execute(sql_update) 7845 7846 # Remove added columns 7847 for added_column in added_columns: 7848 self.drop_column(column=added_column) 7849 7850 def calculation_extract_snpeff_hgvs( 7851 self, 7852 snpeff_hgvs: str = "snpeff_hgvs", 7853 snpeff_field: str = "ANN", 7854 ) -> None: 7855 """ 7856 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 7857 annotation field in a VCF file and adds them as a new column in the variants table. 7858 7859 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 7860 function is used to specify the name of the column that will store the HGVS nomenclatures 7861 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 7862 snpeff_hgvs 7863 :type snpeff_hgvs: str (optional) 7864 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 7865 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 7866 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 7867 to ANN 7868 :type snpeff_field: str (optional) 7869 """ 7870 7871 # Snpeff hgvs tags 7872 vcf_infos_tags = { 7873 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 7874 } 7875 7876 # Prefix 7877 prefix = self.get_explode_infos_prefix() 7878 if prefix: 7879 prefix = "INFO/" 7880 7881 # snpEff fields 7882 speff_ann_infos = prefix + snpeff_field 7883 speff_hgvs_infos = prefix + snpeff_hgvs 7884 7885 # Variants table 7886 table_variants = self.get_table_variants() 7887 7888 # Header 7889 vcf_reader = self.get_header() 7890 7891 # Add columns 7892 added_columns = [] 7893 7894 # Explode HGVS field in column 7895 added_columns += self.explode_infos(fields=[snpeff_field]) 7896 7897 if snpeff_field in vcf_reader.infos: 7898 7899 log.debug(vcf_reader.infos[snpeff_field]) 7900 7901 # Extract ANN header 7902 ann_description = vcf_reader.infos[snpeff_field].desc 7903 pattern = r"'(.+?)'" 7904 match = re.search(pattern, ann_description) 7905 if match: 7906 ann_header_match = match.group(1).split(" | ") 7907 ann_header_desc = {} 7908 for i in range(len(ann_header_match)): 7909 ann_header_info = "".join( 7910 char for char in ann_header_match[i] if char.isalnum() 7911 ) 7912 ann_header_desc[ann_header_info] = ann_header_match[i] 7913 if not ann_header_desc: 7914 raise ValueError("Invalid header description format") 7915 else: 7916 raise ValueError("Invalid header description format") 7917 7918 # Create variant id 7919 variant_id_column = self.get_variant_id_column() 7920 added_columns += [variant_id_column] 7921 7922 # Create dataframe 7923 dataframe_snpeff_hgvs = self.get_query_to_df( 7924 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 7925 ) 7926 7927 # Create main NOMEN column 7928 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 7929 speff_ann_infos 7930 ].apply( 7931 lambda x: extract_snpeff_hgvs( 
7932 str(x), header=list(ann_header_desc.values()) 7933 ) 7934 ) 7935 7936 # Add snpeff_hgvs to header 7937 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 7938 snpeff_hgvs, 7939 ".", 7940 "String", 7941 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 7942 "howard calculation", 7943 "0", 7944 self.code_type_map.get("String"), 7945 ) 7946 7947 # Update 7948 sql_update = f""" 7949 UPDATE variants 7950 SET "INFO" = 7951 concat( 7952 CASE 7953 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7954 THEN '' 7955 ELSE concat("INFO", ';') 7956 END, 7957 CASE 7958 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 7959 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 7960 THEN concat( 7961 '{snpeff_hgvs}=', 7962 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 7963 ) 7964 ELSE '' 7965 END 7966 ) 7967 FROM dataframe_snpeff_hgvs 7968 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 7969 7970 """ 7971 self.conn.execute(sql_update) 7972 7973 # Delete dataframe 7974 del dataframe_snpeff_hgvs 7975 gc.collect() 7976 7977 else: 7978 7979 log.warning( 7980 "No snpEff annotation. Please Anotate with snpEff before use this calculation option" 7981 ) 7982 7983 # Remove added columns 7984 for added_column in added_columns: 7985 self.drop_column(column=added_column) 7986 7987 def calculation_snpeff_ann_explode( 7988 self, 7989 uniquify: bool = True, 7990 output_format: str = "fields", 7991 output_prefix: str = "snpeff_", 7992 snpeff_field: str = "ANN", 7993 ) -> None: 7994 """ 7995 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 7996 exploding the HGVS field and updating variant information accordingly. 7997 7998 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 7999 boolean flag that determines whether the output should be uniquified or not. 
When set to `True`, 8000 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8001 defaults to True 8002 :type uniquify: bool (optional) 8003 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8004 function specifies the format in which the output annotations will be generated. It has a 8005 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8006 format, defaults to fields 8007 :type output_format: str (optional) 8008 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8009 method is used to specify the prefix that will be added to the output annotations generated 8010 during the calculation process. This prefix helps to differentiate the newly added annotations 8011 from existing ones in the output data. By default, the, defaults to ANN_ 8012 :type output_prefix: str (optional) 8013 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8014 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8015 field will be processed to explode the HGVS annotations and update the variant information 8016 accordingly, defaults to ANN 8017 :type snpeff_field: str (optional) 8018 """ 8019 8020 # SnpEff annotation field 8021 snpeff_hgvs = "snpeff_ann_explode" 8022 8023 # Snpeff hgvs tags 8024 vcf_infos_tags = { 8025 snpeff_hgvs: "Explode snpEff annotations", 8026 } 8027 8028 # Prefix 8029 prefix = self.get_explode_infos_prefix() 8030 if prefix: 8031 prefix = "INFO/" 8032 8033 # snpEff fields 8034 speff_ann_infos = prefix + snpeff_field 8035 speff_hgvs_infos = prefix + snpeff_hgvs 8036 8037 # Variants table 8038 table_variants = self.get_table_variants() 8039 8040 # Header 8041 vcf_reader = self.get_header() 8042 8043 # Add columns 8044 added_columns = [] 8045 8046 # Explode HGVS field in column 8047 added_columns += self.explode_infos(fields=[snpeff_field]) 8048 log.debug(f"snpeff_field={snpeff_field}") 8049 log.debug(f"added_columns={added_columns}") 8050 8051 if snpeff_field in vcf_reader.infos: 8052 8053 # Extract ANN header 8054 ann_description = vcf_reader.infos[snpeff_field].desc 8055 pattern = r"'(.+?)'" 8056 match = re.search(pattern, ann_description) 8057 if match: 8058 ann_header_match = match.group(1).split(" | ") 8059 ann_header = [] 8060 ann_header_desc = {} 8061 for i in range(len(ann_header_match)): 8062 ann_header_info = "".join( 8063 char for char in ann_header_match[i] if char.isalnum() 8064 ) 8065 ann_header.append(ann_header_info) 8066 ann_header_desc[ann_header_info] = ann_header_match[i] 8067 if not ann_header_desc: 8068 raise ValueError("Invalid header description format") 8069 else: 8070 raise ValueError("Invalid header description format") 8071 8072 # Create variant id 8073 variant_id_column = self.get_variant_id_column() 8074 added_columns += [variant_id_column] 8075 8076 # Create dataframe 8077 dataframe_snpeff_hgvs = self.get_query_to_df( 8078 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8079 ) 8080 
8081 # Create snpEff columns 8082 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8083 speff_ann_infos 8084 ].apply( 8085 lambda x: explode_snpeff_ann( 8086 str(x), 8087 uniquify=uniquify, 8088 output_format=output_format, 8089 prefix=output_prefix, 8090 header=list(ann_header_desc.values()), 8091 ) 8092 ) 8093 8094 # Header 8095 ann_annotations_prefix = "" 8096 if output_format.upper() in ["JSON"]: 8097 ann_annotations_prefix = f"{output_prefix}=" 8098 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8099 output_prefix, 8100 ".", 8101 "String", 8102 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8103 + " - JSON format", 8104 "howard calculation", 8105 "0", 8106 self.code_type_map.get("String"), 8107 ) 8108 else: 8109 for ann_annotation in ann_header: 8110 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8111 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8112 ann_annotation_id, 8113 ".", 8114 "String", 8115 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8116 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8117 "howard calculation", 8118 "0", 8119 self.code_type_map.get("String"), 8120 ) 8121 8122 # Update 8123 sql_update = f""" 8124 UPDATE variants 8125 SET "INFO" = 8126 concat( 8127 CASE 8128 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8129 THEN '' 8130 ELSE concat("INFO", ';') 8131 END, 8132 CASE 8133 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8134 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8135 THEN concat( 8136 '{ann_annotations_prefix}', 8137 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8138 ) 8139 ELSE '' 8140 END 8141 ) 8142 FROM dataframe_snpeff_hgvs 8143 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8144 8145 """ 8146 self.conn.execute(sql_update) 8147 8148 # Delete dataframe 8149 del dataframe_snpeff_hgvs 8150 gc.collect() 8151 8152 else: 8153 8154 log.warning( 8155 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8156 ) 8157 8158 # Remove added columns 8159 for added_column in added_columns: 8160 self.drop_column(column=added_column) 8161 8162 def calculation_extract_nomen(self) -> None: 8163 """ 8164 This function extracts the HGVS nomenclature from the calculation/identification of NOMEN. 8165 """ 8166 8167 # NOMEN field 8168 field_nomen_dict = "NOMEN_DICT" 8169 8170 # NOMEN structure 8171 nomen_dict = { 8172 "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)", 8173 "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)", 8174 "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)", 8175 "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant", 8176 "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)", 8177 "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)", 8178 "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)", 8179 "VNOMEN": "VNOMEN hgvs transcript version used (e.g. 
for CNOMEN and PNOMEN)", 8180 "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)", 8181 "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)", 8182 } 8183 8184 # Param 8185 param = self.get_param() 8186 8187 # Prefix 8188 prefix = self.get_explode_infos_prefix() 8189 8190 # Header 8191 vcf_reader = self.get_header() 8192 8193 # Get HGVS field 8194 hgvs_field = ( 8195 param.get("calculation", {}) 8196 .get("calculations", {}) 8197 .get("NOMEN", {}) 8198 .get("options", {}) 8199 .get("hgvs_field", "hgvs") 8200 ) 8201 8202 # Get transcripts 8203 transcripts_file = ( 8204 param.get("calculation", {}) 8205 .get("calculations", {}) 8206 .get("NOMEN", {}) 8207 .get("options", {}) 8208 .get("transcripts", None) 8209 ) 8210 transcripts_file = full_path(transcripts_file) 8211 transcripts = [] 8212 if transcripts_file: 8213 if os.path.exists(transcripts_file): 8214 transcripts_dataframe = transcripts_file_to_df(transcripts_file) 8215 transcripts = transcripts_dataframe.iloc[:, 0].tolist() 8216 else: 8217 log.error(f"Transcript file '{transcripts_file}' does NOT exist") 8218 raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist") 8219 8220 # Added columns 8221 added_columns = [] 8222 8223 # Explode HGVS field in column 8224 added_columns += self.explode_infos(fields=[hgvs_field]) 8225 8226 # extra infos 8227 extra_infos = self.get_extra_infos() 8228 extra_field = prefix + hgvs_field 8229 8230 if extra_field in extra_infos: 8231 8232 # Create dataframe 8233 dataframe_hgvs = self.get_query_to_df( 8234 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """ 8235 ) 8236 8237 # Create main NOMEN column 8238 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply( 8239 lambda x: find_nomen(str(x), transcripts=transcripts) 8240 ) 8241 8242 # Explode NOMEN Structure and create SQL set for update 8243 sql_nomen_fields = [] 8244 for nomen_field in nomen_dict: 8245 8246 # Explode each field 
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        Only runs when the header has a FORMAT column and at least one sample;
        otherwise it is a no-op.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Name of the computed dataframe column
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added to the table, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column (row-wise over genotype columns)
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline tag to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: the SQL references the local DataFrame by its Python
            # variable name (DuckDB replacement scan) — do not rename it
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_findbypipeline
            gc.collect()
    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.

        Only runs when the header has a FORMAT column and at least one sample;
        otherwise it is a no-op.
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Name of the computed dataframe column
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added to the table, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column (row-wise over genotype columns)
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header
            # (fallback description is never used: the tag key is always present)
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: the SQL references the local DataFrame by its Python
            # variable name (DuckDB replacement scan) — do not rename it
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_genotypeconcordance
            gc.collect()
If no tag name is provided, 8518 the default tag name is set to "barcode", defaults to barcode 8519 :type tag: str (optional) 8520 """ 8521 8522 # if FORMAT and samples 8523 if ( 8524 "FORMAT" in self.get_header_columns_as_list() 8525 and self.get_header_sample_list() 8526 ): 8527 8528 # barcode annotation field 8529 if not tag: 8530 tag = "barcode" 8531 8532 # VCF infos tags 8533 vcf_infos_tags = { 8534 tag: "barcode calculation (VaRank)", 8535 } 8536 8537 # Prefix 8538 prefix = self.get_explode_infos_prefix() 8539 8540 # Field 8541 barcode_infos = prefix + tag 8542 8543 # Variants table 8544 table_variants = self.get_table_variants() 8545 8546 # Header 8547 vcf_reader = self.get_header() 8548 8549 # Create variant id 8550 variant_id_column = self.get_variant_id_column() 8551 added_columns = [variant_id_column] 8552 8553 # variant_id, FORMAT and samples 8554 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8555 self.get_header_sample_list() 8556 ) 8557 8558 # Create dataframe 8559 dataframe_barcode = self.get_query_to_df( 8560 f""" SELECT {samples_fields} FROM {table_variants} """ 8561 ) 8562 8563 # Create barcode column 8564 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8565 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8566 ) 8567 8568 # Add barcode to header 8569 vcf_reader.infos[tag] = vcf.parser._Info( 8570 tag, 8571 ".", 8572 "String", 8573 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8574 "howard calculation", 8575 "0", 8576 self.code_type_map.get("String"), 8577 ) 8578 8579 # Update 8580 sql_update = f""" 8581 UPDATE {table_variants} 8582 SET "INFO" = 8583 concat( 8584 CASE 8585 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8586 THEN '' 8587 ELSE concat("INFO", ';') 8588 END, 8589 CASE 8590 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8591 AND dataframe_barcode."{barcode_infos}" NOT NULL 8592 THEN concat( 8593 '{tag}=', 8594 dataframe_barcode."{barcode_infos}" 8595 ) 8596 ELSE '' 8597 
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        The pedigree (param calculation.calculations.BARCODEFAMILY.family_pedigree)
        may be a JSON file path, a JSON string, a comma-separated sample list, or a
        dict; if absent, all header samples are used.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is malformed or resolves to no samples
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against empty/None tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags (FORMAT tag and companion samples tag)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as comma-separated sample names
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of sample names from pedigree values
                ped_samples = list(ped.values())

            else:
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the computed dataframe column
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added to the table, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and pedigree samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (row-wise over pedigree sample columns)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: append the two new FORMAT values to FORMAT and every sample.
            # Pedigree samples get the barcode value; non-pedigree samples get '.'.
            # './.' genotypes are padded with missing values for the existing keys.
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # The SQL references the local DataFrame by its Python variable name
            # (DuckDB replacement scan) — do not rename it
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_barcode
            gc.collect()
    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.

        The trio pedigree (param calculation.calculations.TRIO.trio_pedigree) may
        be a JSON file path, a JSON string, a comma-separated "father,mother,child"
        list, or a dict with "father"/"mother"/"child" keys; if absent, the first
        three header samples are used.

        :raises ValueError: if the trio pedigree is malformed or fewer than 3
            samples are available
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Trio param
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Trio pedigree is a string
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        # Not JSON: expect "father,mother,child"
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio list in father/mother/child order
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree (must define exactly 3 members)
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Name of the computed dataframe column
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added to the table, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create trio column (row-wise over the trio sample columns)
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # (fallback description is never used: the tag key is always present)
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: the SQL references the local DataFrame by its Python
            # variable name (DuckDB replacement scan) — do not rename it
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                                AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                '{trio_tag}=',
                                dataframe_trio."{trio_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_trio
            gc.collect()
concat("INFO", ';') 8957 END, 8958 CASE 8959 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 8960 AND dataframe_trio."{trio_infos}" NOT NULL 8961 THEN concat( 8962 '{trio_tag}=', 8963 dataframe_trio."{trio_infos}" 8964 ) 8965 ELSE '' 8966 END 8967 ) 8968 FROM dataframe_trio 8969 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 8970 """ 8971 self.conn.execute(sql_update) 8972 8973 # Remove added columns 8974 for added_column in added_columns: 8975 self.drop_column(column=added_column) 8976 8977 # Delete dataframe 8978 del dataframe_trio 8979 gc.collect() 8980 8981 def calculation_vaf_normalization(self) -> None: 8982 """ 8983 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 8984 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 8985 :return: The function does not return anything. 8986 """ 8987 8988 # if FORMAT and samples 8989 if ( 8990 "FORMAT" in self.get_header_columns_as_list() 8991 and self.get_header_sample_list() 8992 ): 8993 8994 # vaf_normalization annotation field 8995 vaf_normalization_tag = "VAF" 8996 8997 # VCF infos tags 8998 vcf_infos_tags = { 8999 "VAF": "VAF Variant Frequency", 9000 } 9001 9002 # Prefix 9003 prefix = self.get_explode_infos_prefix() 9004 9005 # Variants table 9006 table_variants = self.get_table_variants() 9007 9008 # Header 9009 vcf_reader = self.get_header() 9010 9011 # Do not calculate if VAF already exists 9012 if "VAF" in vcf_reader.formats: 9013 log.debug("VAF already on genotypes") 9014 return 9015 9016 # Create variant id 9017 variant_id_column = self.get_variant_id_column() 9018 added_columns = [variant_id_column] 9019 9020 # variant_id, FORMAT and samples 9021 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9022 f""" "{sample}" """ for sample in self.get_header_sample_list() 9023 ) 9024 9025 # Create dataframe 9026 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Adds one INFO field per statistic: <info>_stats_nb, _list, _min, _max,
        _mean, _mediane and _stdev. Only runs when the header has a FORMAT column
        and at least one sample.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one entry per computed statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Name of the computed dataframe column (dict of all stats per variant)
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added to the table, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (row-wise over genotype columns)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add statistic field to header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # No separator before the first field (the INFO CASE above already
                # appends ';' when INFO is non-empty); ';' between subsequent fields
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                # NOTE(review): unlike sibling calculations, this only checks
                # NOT NULL — an empty-string stat still emits '{stat}='; confirm
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                                '{sep}{stat}=',
                                dataframe_vaf_stats."{stat}"
                            )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update: the SQL references the local DataFrame by its Python
            # variable name (DuckDB replacement scan) — do not rename it
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_vaf_stats
            gc.collect()
9221 9222 :param info: The `info` parameter in the `calculation_transcripts_json` method is a string 9223 parameter that specifies the information field to be used in the transcripts JSON. It has a 9224 default value of "transcripts_json" if no value is provided when calling the method, defaults to 9225 transcripts_json 9226 :type info: str (optional) 9227 """ 9228 9229 # Create transcripts table 9230 transcripts_table = self.create_transcript_view() 9231 9232 # Add info field 9233 if transcripts_table: 9234 self.transcript_view_to_variants( 9235 transcripts_table=transcripts_table, transcripts_info_field=info 9236 ) 9237 else: 9238 log.info("No Transcripts to process. Check param.json file configuration") 9239 9240 ############### 9241 # Transcripts # 9242 ############### 9243 9244 def create_transcript_view_from_columns_map( 9245 self, 9246 transcripts_table: str = "transcripts", 9247 columns_maps: dict = {}, 9248 added_columns: list = [], 9249 temporary_tables: list = None, 9250 annotation_fields: list = None, 9251 ) -> tuple[list, list, list]: 9252 """ 9253 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 9254 specified columns mapping for transcripts data. 9255 9256 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9257 the table where the transcripts data is stored or will be stored in the database. This table 9258 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9259 predictions, etc. It defaults to "transcripts, defaults to transcripts 9260 :type transcripts_table: str (optional) 9261 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9262 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9263 represents a mapping configuration for a specific set of columns. 
It typically includes details such 9264 as the main transcript column and additional information columns 9265 :type columns_maps: dict 9266 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9267 function is a list that stores the additional columns that will be added to the view being created 9268 based on the columns map provided. These columns are generated by exploding the transcript 9269 information columns along with the main transcript column 9270 :type added_columns: list 9271 :param temporary_tables: The `temporary_tables` parameter in the 9272 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9273 tables created during the process of creating a transcript view from a columns map. These temporary 9274 tables are used to store intermediate results or transformations before the final view is generated 9275 :type temporary_tables: list 9276 :param annotation_fields: The `annotation_fields` parameter in the 9277 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9278 for annotation in the query view creation process. These fields are extracted from the 9279 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9280 :type annotation_fields: list 9281 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9282 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 
9283 """ 9284 9285 log.debug("Start transcrpts view creation from columns map...") 9286 9287 # "from_columns_map": [ 9288 # { 9289 # "transcripts_column": "Ensembl_transcriptid", 9290 # "transcripts_infos_columns": [ 9291 # "genename", 9292 # "Ensembl_geneid", 9293 # "LIST_S2_score", 9294 # "LIST_S2_pred", 9295 # ], 9296 # }, 9297 # { 9298 # "transcripts_column": "Ensembl_transcriptid", 9299 # "transcripts_infos_columns": [ 9300 # "genename", 9301 # "VARITY_R_score", 9302 # "Aloft_pred", 9303 # ], 9304 # }, 9305 # ], 9306 9307 # Init 9308 if temporary_tables is None: 9309 temporary_tables = [] 9310 if annotation_fields is None: 9311 annotation_fields = [] 9312 9313 # Variants table 9314 table_variants = self.get_table_variants() 9315 9316 for columns_map in columns_maps: 9317 9318 # Transcript column 9319 transcripts_column = columns_map.get("transcripts_column", None) 9320 9321 # Transcripts infos columns 9322 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9323 9324 if transcripts_column is not None: 9325 9326 # Explode 9327 added_columns += self.explode_infos( 9328 fields=[transcripts_column] + transcripts_infos_columns 9329 ) 9330 9331 # View clauses 9332 clause_select = [] 9333 for field in [transcripts_column] + transcripts_infos_columns: 9334 clause_select.append( 9335 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9336 ) 9337 if field not in [transcripts_column]: 9338 annotation_fields.append(field) 9339 9340 # Querey View 9341 query = f""" 9342 SELECT 9343 "#CHROM", POS, REF, ALT, 9344 "{transcripts_column}" AS 'transcript', 9345 {", ".join(clause_select)} 9346 FROM ( 9347 SELECT 9348 "#CHROM", POS, REF, ALT, 9349 {", ".join(clause_select)} 9350 FROM {table_variants} 9351 ) 9352 WHERE "{transcripts_column}" IS NOT NULL 9353 """ 9354 9355 # Create temporary table 9356 temporary_table = transcripts_table + "".join( 9357 random.choices(string.ascii_uppercase + string.digits, k=10) 9358 ) 9359 9360 # Temporary_tables 
9361 temporary_tables.append(temporary_table) 9362 query_view = f""" 9363 CREATE TEMPORARY TABLE {temporary_table} 9364 AS ({query}) 9365 """ 9366 self.execute_query(query=query_view) 9367 9368 return added_columns, temporary_tables, annotation_fields 9369 9370 def create_transcript_view_from_column_format( 9371 self, 9372 transcripts_table: str = "transcripts", 9373 column_formats: dict = {}, 9374 temporary_tables: list = None, 9375 annotation_fields: list = None, 9376 ) -> tuple[list, list, list]: 9377 """ 9378 The `create_transcript_view_from_column_format` function generates a transcript view based on 9379 specified column formats, adds additional columns and annotation fields, and returns the list of 9380 temporary tables and annotation fields. 9381 9382 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9383 the table containing the transcripts data. This table will be used as the base table for creating 9384 the transcript view. The default value for this parameter is "transcripts", but you can provide a 9385 different table name if needed, defaults to transcripts 9386 :type transcripts_table: str (optional) 9387 :param column_formats: The `column_formats` parameter is a dictionary that contains information 9388 about the columns to be used for creating the transcript view. Each entry in the dictionary 9389 specifies the mapping between a transcripts column and a transcripts infos column. For example, in 9390 the provided code snippet: 9391 :type column_formats: dict 9392 :param temporary_tables: The `temporary_tables` parameter in the 9393 `create_transcript_view_from_column_format` function is a list that stores the names of temporary 9394 views created during the process of creating a transcript view from a column format. These temporary 9395 views are used to manipulate and extract data before generating the final transcript view. 
It 9396 :type temporary_tables: list 9397 :param annotation_fields: The `annotation_fields` parameter in the 9398 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 9399 that are extracted from the temporary views created during the process. These annotation fields are 9400 obtained by querying the temporary views and extracting the column names excluding specific columns 9401 like `#CH 9402 :type annotation_fields: list 9403 :return: The `create_transcript_view_from_column_format` function returns two lists: 9404 `temporary_tables` and `annotation_fields`. 9405 """ 9406 9407 log.debug("Start transcrpts view creation from column format...") 9408 9409 # "from_column_format": [ 9410 # { 9411 # "transcripts_column": "ANN", 9412 # "transcripts_infos_column": "Feature_ID", 9413 # } 9414 # ], 9415 9416 # Init 9417 if temporary_tables is None: 9418 temporary_tables = [] 9419 if annotation_fields is None: 9420 annotation_fields = [] 9421 9422 for column_format in column_formats: 9423 9424 # annotation field and transcript annotation field 9425 annotation_field = column_format.get("transcripts_column", "ANN") 9426 transcript_annotation = column_format.get( 9427 "transcripts_infos_column", "Feature_ID" 9428 ) 9429 9430 # Temporary View name 9431 temporary_view_name = transcripts_table + "".join( 9432 random.choices(string.ascii_uppercase + string.digits, k=10) 9433 ) 9434 9435 # Create temporary view name 9436 temporary_view_name = self.annotation_format_to_table( 9437 uniquify=True, 9438 annotation_field=annotation_field, 9439 view_name=temporary_view_name, 9440 annotation_id=transcript_annotation, 9441 ) 9442 9443 # Annotation fields 9444 if temporary_view_name: 9445 query_annotation_fields = f""" 9446 SELECT * 9447 FROM ( 9448 DESCRIBE SELECT * 9449 FROM {temporary_view_name} 9450 ) 9451 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9452 """ 9453 df_annotation_fields = self.get_query_to_df( 9454 
query=query_annotation_fields 9455 ) 9456 9457 # Add temporary view and annotation fields 9458 temporary_tables.append(temporary_view_name) 9459 annotation_fields += list(set(df_annotation_fields["column_name"])) 9460 9461 return temporary_tables, annotation_fields 9462 9463 def create_transcript_view( 9464 self, 9465 transcripts_table: str = None, 9466 transcripts_table_drop: bool = True, 9467 param: dict = {}, 9468 ) -> str: 9469 """ 9470 The `create_transcript_view` function generates a transcript view by processing data from a 9471 specified table based on provided parameters and structural information. 9472 9473 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9474 is used to specify the name of the table that will store the final transcript view data. If a table 9475 name is not provided, the function will create a new table to store the transcript view data, and by 9476 default,, defaults to transcripts 9477 :type transcripts_table: str (optional) 9478 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9479 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9480 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9481 the function will drop the existing transcripts table if it exists, defaults to True 9482 :type transcripts_table_drop: bool (optional) 9483 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9484 contains information needed to create a transcript view. It includes details such as the structure 9485 of the transcripts, columns mapping, column formats, and other necessary information for generating 9486 the view. 
This parameter allows for flexibility and customization 9487 :type param: dict 9488 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9489 created or modified during the execution of the function. 9490 """ 9491 9492 log.debug("Start transcrpts view creation...") 9493 9494 # Default 9495 transcripts_table_default = "transcripts" 9496 9497 # Param 9498 if not param: 9499 param = self.get_param() 9500 9501 # Struct 9502 struct = param.get("transcripts", {}).get("struct", None) 9503 9504 if struct: 9505 9506 # Transcripts table 9507 if transcripts_table is None: 9508 transcripts_table = param.get("transcripts", {}).get( 9509 "table", transcripts_table_default 9510 ) 9511 9512 # added_columns 9513 added_columns = [] 9514 9515 # Temporary tables 9516 temporary_tables = [] 9517 9518 # Annotation fields 9519 annotation_fields = [] 9520 9521 # from columns map 9522 columns_maps = struct.get("from_columns_map", []) 9523 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 9524 self.create_transcript_view_from_columns_map( 9525 transcripts_table=transcripts_table, 9526 columns_maps=columns_maps, 9527 added_columns=added_columns, 9528 temporary_tables=temporary_tables, 9529 annotation_fields=annotation_fields, 9530 ) 9531 ) 9532 added_columns += added_columns_tmp 9533 temporary_tables += temporary_tables_tmp 9534 annotation_fields += annotation_fields_tmp 9535 9536 # from column format 9537 column_formats = struct.get("from_column_format", []) 9538 temporary_tables_tmp, annotation_fields_tmp = ( 9539 self.create_transcript_view_from_column_format( 9540 transcripts_table=transcripts_table, 9541 column_formats=column_formats, 9542 temporary_tables=temporary_tables, 9543 annotation_fields=annotation_fields, 9544 ) 9545 ) 9546 temporary_tables += temporary_tables_tmp 9547 annotation_fields += annotation_fields_tmp 9548 9549 # Merge temporary tables query 9550 query_merge = "" 9551 for temporary_table in 
temporary_tables: 9552 9553 # First temporary table 9554 if not query_merge: 9555 query_merge = f""" 9556 SELECT * FROM {temporary_table} 9557 """ 9558 # other temporary table (using UNION) 9559 else: 9560 query_merge += f""" 9561 UNION BY NAME SELECT * FROM {temporary_table} 9562 """ 9563 9564 # Merge on transcript 9565 query_merge_on_transcripts_annotation_fields = [] 9566 # Aggregate all annotations fields 9567 for annotation_field in set(annotation_fields): 9568 query_merge_on_transcripts_annotation_fields.append( 9569 f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """ 9570 ) 9571 # Query for transcripts view 9572 query_merge_on_transcripts = f""" 9573 SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)} 9574 FROM ({query_merge}) 9575 GROUP BY "#CHROM", POS, REF, ALT, transcript 9576 """ 9577 9578 # Drop transcript view is necessary 9579 if transcripts_table_drop: 9580 query_drop = f""" 9581 DROP TABLE IF EXISTS {transcripts_table}; 9582 """ 9583 self.execute_query(query=query_drop) 9584 9585 # Merge and create transcript view 9586 query_create_view = f""" 9587 CREATE TABLE IF NOT EXISTS {transcripts_table} 9588 AS {query_merge_on_transcripts} 9589 """ 9590 self.execute_query(query=query_create_view) 9591 9592 # Remove added columns 9593 for added_column in added_columns: 9594 self.drop_column(column=added_column) 9595 9596 else: 9597 9598 transcripts_table = None 9599 9600 return transcripts_table 9601 9602 def annotation_format_to_table( 9603 self, 9604 uniquify: bool = True, 9605 annotation_field: str = "ANN", 9606 annotation_id: str = "Feature_ID", 9607 view_name: str = "transcripts", 9608 ) -> str: 9609 """ 9610 The function `annotation_format_to_table` converts annotation data from a VCF file into a structured 9611 table format. 
9612 9613 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique 9614 values in the output or not. If set to `True`, the function will make sure that the output values 9615 are unique, defaults to True 9616 :type uniquify: bool (optional) 9617 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that 9618 contains the annotation information for each variant. This field is used to extract the annotation 9619 details for further processing in the function, defaults to ANN 9620 :type annotation_field: str (optional) 9621 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is 9622 used to specify the identifier for the annotation feature. This identifier will be used as a column 9623 name in the resulting table or view that is created based on the annotation data. It helps in 9624 uniquely identifying each annotation entry in the, defaults to Feature_ID 9625 :type annotation_id: str (optional) 9626 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to 9627 specify the name of the temporary table that will be created to store the transformed annotation 9628 data. This table will hold the extracted information from the annotation field in a structured 9629 format for further processing or analysis, defaults to transcripts 9630 :type view_name: str (optional) 9631 :return: The function `annotation_format_to_table` is returning the name of the view created, which 9632 is stored in the variable `view_name`. 
9633 """ 9634 9635 # Annotation field 9636 annotation_format = "annotation_explode" 9637 9638 # Transcript annotation 9639 annotation_id = "".join(char for char in annotation_id if char.isalnum()) 9640 9641 # Prefix 9642 prefix = self.get_explode_infos_prefix() 9643 if prefix: 9644 prefix = "INFO/" 9645 9646 # Annotation fields 9647 annotation_infos = prefix + annotation_field 9648 annotation_format_infos = prefix + annotation_format 9649 9650 # Variants table 9651 table_variants = self.get_table_variants() 9652 9653 # Header 9654 vcf_reader = self.get_header() 9655 9656 # Add columns 9657 added_columns = [] 9658 9659 # Explode HGVS field in column 9660 added_columns += self.explode_infos(fields=[annotation_field]) 9661 9662 if annotation_field in vcf_reader.infos: 9663 9664 # Extract ANN header 9665 ann_description = vcf_reader.infos[annotation_field].desc 9666 pattern = r"'(.+?)'" 9667 match = re.search(pattern, ann_description) 9668 if match: 9669 ann_header_match = match.group(1).split(" | ") 9670 ann_header = [] 9671 ann_header_desc = {} 9672 for i in range(len(ann_header_match)): 9673 ann_header_info = "".join( 9674 char for char in ann_header_match[i] if char.isalnum() 9675 ) 9676 ann_header.append(ann_header_info) 9677 ann_header_desc[ann_header_info] = ann_header_match[i] 9678 if not ann_header_desc: 9679 raise ValueError("Invalid header description format") 9680 else: 9681 raise ValueError("Invalid header description format") 9682 9683 # Create variant id 9684 variant_id_column = self.get_variant_id_column() 9685 added_columns += [variant_id_column] 9686 9687 # Create dataframe 9688 dataframe_annotation_format = self.get_query_to_df( 9689 f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """ 9690 ) 9691 9692 # Create annotation columns 9693 dataframe_annotation_format[ 9694 annotation_format_infos 9695 ] = dataframe_annotation_format[annotation_infos].apply( 9696 lambda x: explode_annotation_format( 
9697 annotation=str(x), 9698 uniquify=uniquify, 9699 output_format="JSON", 9700 prefix="", 9701 header=list(ann_header_desc.values()), 9702 ) 9703 ) 9704 9705 # Find keys 9706 query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;""" 9707 df_keys = self.get_query_to_df(query=query_json) 9708 9709 # Check keys 9710 query_json_key = [] 9711 for _, row in df_keys.iterrows(): 9712 9713 # Key 9714 key = row.iloc[0] 9715 9716 # key_clean 9717 key_clean = "".join(char for char in key if char.isalnum()) 9718 9719 # Type 9720 query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');""" 9721 9722 # Get DataFrame from query 9723 df_json_type = self.get_query_to_df(query=query_json_type) 9724 9725 # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN 9726 with pd.option_context("future.no_silent_downcasting", True): 9727 df_json_type.fillna(value="", inplace=True) 9728 replace_dict = {None: np.nan, "": np.nan} 9729 df_json_type.replace(replace_dict, inplace=True) 9730 df_json_type.dropna(inplace=True) 9731 9732 # Detect column type 9733 column_type = detect_column_type(df_json_type[key_clean]) 9734 9735 # Append 9736 query_json_key.append( 9737 f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """ 9738 ) 9739 9740 # Create view 9741 query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));""" 9742 self.execute_query(query=query_view) 9743 9744 else: 9745 9746 # Return None 9747 view_name = None 9748 9749 # Remove added columns 9750 for added_column in added_columns: 9751 self.drop_column(column=added_column) 9752 9753 return 
view_name 9754 9755 def transcript_view_to_variants( 9756 self, 9757 transcripts_table: str = None, 9758 transcripts_column_id: str = None, 9759 transcripts_info_json: str = None, 9760 transcripts_info_field: str = None, 9761 param: dict = {}, 9762 ) -> bool: 9763 """ 9764 The function `transcript_view_to_variants` takes input parameters related to transcripts and updates 9765 a variants table with information from the transcripts in JSON format. 9766 9767 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the table 9768 containing the transcripts data. If this parameter is not provided, the function will attempt to 9769 retrieve it from the `param` dictionary or use a default value of "transcripts" 9770 :type transcripts_table: str 9771 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in 9772 the `transcripts_table` that contains the unique identifier for each transcript. This identifier is 9773 used to match transcripts with variants in the database 9774 :type transcripts_column_id: str 9775 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of 9776 the column in the variants table where the transcripts information will be stored in JSON format 9777 :type transcripts_info_json: str 9778 :param transcripts_info_field: The `transcripts_info_field` parameter is used to specify the field 9779 in the VCF header that will contain information about transcripts in JSON format. This field will be 9780 added to the VCF header as an INFO field with the specified name 9781 :type transcripts_info_field: str 9782 :param param: The `transcript_view_to_variants` method takes several parameters: 9783 :type param: dict 9784 :return: The function `transcript_view_to_variants` returns a boolean value, which is `True` if the 9785 operation is successful and `False` if certain conditions are not met. 
9786 """ 9787 9788 log.debug("Start transcripts view to JSON...") 9789 9790 # Default 9791 transcripts_table_default = "transcripts" 9792 transcripts_column_id_default = "transcript" 9793 transcripts_info_json_default = None 9794 transcripts_info_field_default = None 9795 9796 # Param 9797 if not param: 9798 param = self.get_param() 9799 9800 # Transcripts table 9801 if transcripts_table is None: 9802 transcripts_table = param.get("transcripts", {}).get( 9803 "table", transcripts_table_default 9804 ) 9805 9806 # Transcripts column ID 9807 if transcripts_column_id is None: 9808 transcripts_column_id = param.get("transcripts", {}).get( 9809 "column_id", transcripts_column_id_default 9810 ) 9811 9812 # Transcripts info field 9813 if transcripts_info_json is None: 9814 transcripts_info_json = param.get("transcripts", {}).get( 9815 "transcripts_info_json", transcripts_info_json_default 9816 ) 9817 9818 # Transcripts info field 9819 if transcripts_info_field is None: 9820 transcripts_info_field = param.get("transcripts", {}).get( 9821 "transcripts_info_field", transcripts_info_field_default 9822 ) 9823 9824 # Variants table 9825 table_variants = self.get_table_variants() 9826 9827 # Check info columns param 9828 if transcripts_info_json is None and transcripts_info_field is None: 9829 return False 9830 9831 # Transcripts infos columns 9832 query_transcripts_infos_columns = f""" 9833 SELECT * 9834 FROM ( 9835 DESCRIBE SELECT * FROM {transcripts_table} 9836 ) 9837 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 9838 """ 9839 transcripts_infos_columns = list( 9840 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 9841 ) 9842 9843 # View results 9844 clause_select = [] 9845 clause_to_json = [] 9846 for field in transcripts_infos_columns: 9847 clause_select.append( 9848 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9849 ) 9850 clause_to_json.append(f""" '{field}': "{field}" """) 9851 9852 # Update 
9853 update_set = [] 9854 9855 # VCF header 9856 vcf_reader = self.get_header() 9857 9858 # Transcripts to info column in JSON 9859 if transcripts_info_json is not None: 9860 9861 # Create column on variants table 9862 self.add_column( 9863 table_name=table_variants, 9864 column_name=transcripts_info_json, 9865 column_type="JSON", 9866 default_value=None, 9867 drop=False, 9868 ) 9869 9870 # Add to update 9871 update_set.append( 9872 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 9873 ) 9874 9875 # Add header 9876 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 9877 transcripts_info_json, 9878 ".", 9879 "String", 9880 "Transcripts in JSON format", 9881 "unknwon", 9882 "unknwon", 9883 self.code_type_map["String"], 9884 ) 9885 9886 # Transcripts to info field in JSON 9887 if transcripts_info_field is not None: 9888 9889 # Add to update 9890 update_set.append( 9891 f""" 9892 INFO = concat( 9893 CASE 9894 WHEN INFO NOT IN ('', '.') 9895 THEN INFO 9896 ELSE '' 9897 END, 9898 CASE 9899 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 9900 THEN concat( 9901 ';{transcripts_info_field}=', 9902 t.{transcripts_info_json} 9903 ) 9904 ELSE '' 9905 END 9906 ) 9907 """ 9908 ) 9909 9910 # Add header 9911 vcf_reader.infos[transcripts_info_field] = vcf.parser._Info( 9912 transcripts_info_field, 9913 ".", 9914 "String", 9915 "Transcripts in JSON format", 9916 "unknwon", 9917 "unknwon", 9918 self.code_type_map["String"], 9919 ) 9920 9921 # Update query 9922 query_update = f""" 9923 UPDATE {table_variants} 9924 SET {", ".join(update_set)} 9925 FROM 9926 ( 9927 SELECT 9928 "#CHROM", POS, REF, ALT, 9929 concat( 9930 '{{', 9931 string_agg( 9932 '"' || "{transcripts_column_id}" || '":' || 9933 to_json(json_output) 9934 ), 9935 '}}' 9936 )::JSON AS {transcripts_info_json} 9937 FROM 9938 ( 9939 SELECT 9940 "#CHROM", POS, REF, ALT, 9941 "{transcripts_column_id}", 9942 to_json( 9943 {{{",".join(clause_to_json)}}} 9944 )::JSON AS json_output 9945 FROM 9946 
(SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 9947 WHERE "{transcripts_column_id}" IS NOT NULL 9948 ) 9949 GROUP BY "#CHROM", POS, REF, ALT 9950 ) AS t 9951 WHERE {table_variants}."#CHROM" = t."#CHROM" 9952 AND {table_variants}."POS" = t."POS" 9953 AND {table_variants}."REF" = t."REF" 9954 AND {table_variants}."ALT" = t."ALT" 9955 """ 9956 9957 self.execute_query(query=query_update) 9958 9959 return True
34class Variants: 35 36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Load data 78 if load: 79 self.load_data() 80 81 def set_input(self, input: str = None) -> None: 82 """ 83 The function `set_input` takes a file name as input, extracts the name and extension, and sets 84 attributes in the class accordingly. 85 86 :param input: The `set_input` method in the provided code snippet is used to set attributes 87 related to the input file. Here's a breakdown of the parameters and their usage in the method: 88 :type input: str 89 """ 90 91 if input and not isinstance(input, str): 92 try: 93 self.input = input.name 94 except: 95 log.error(f"Input file '{input} in bad format") 96 raise ValueError(f"Input file '{input} in bad format") 97 else: 98 self.input = input 99 100 # Input format 101 if input: 102 input_name, input_extension = os.path.splitext(self.input) 103 self.input_name = input_name 104 self.input_extension = input_extension 105 self.input_format = self.input_extension.replace(".", "") 106 107 def set_config(self, config: dict) -> None: 108 """ 109 The set_config function takes a config object and assigns it as the configuration object for the 110 class. 
111 112 :param config: The `config` parameter in the `set_config` function is a dictionary object that 113 contains configuration settings for the class. When you call the `set_config` function with a 114 dictionary object as the argument, it will set that dictionary as the configuration object for 115 the class 116 :type config: dict 117 """ 118 119 self.config = config 120 121 def set_param(self, param: dict) -> None: 122 """ 123 This function sets a parameter object for the class based on the input dictionary. 124 125 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 126 as the `param` attribute of the class instance 127 :type param: dict 128 """ 129 130 self.param = param 131 132 def init_variables(self) -> None: 133 """ 134 This function initializes the variables that will be used in the rest of the class 135 """ 136 137 self.prefix = "howard" 138 self.table_variants = "variants" 139 self.dataframe = None 140 141 self.comparison_map = { 142 "gt": ">", 143 "gte": ">=", 144 "lt": "<", 145 "lte": "<=", 146 "equals": "=", 147 "contains": "SIMILAR TO", 148 } 149 150 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 151 152 self.code_type_map_to_sql = { 153 "Integer": "INTEGER", 154 "String": "VARCHAR", 155 "Float": "FLOAT", 156 "Flag": "VARCHAR", 157 } 158 159 self.index_additionnal_fields = [] 160 161 def get_indexing(self) -> bool: 162 """ 163 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 164 returns False. 165 :return: The value of the indexing parameter. 166 """ 167 168 return self.get_param().get("indexing", False) 169 170 def get_connexion_config(self) -> dict: 171 """ 172 The function `get_connexion_config` returns a dictionary containing the configuration for a 173 connection, including the number of threads and memory limit. 174 :return: a dictionary containing the configuration for the Connexion library. 
175 """ 176 177 # config 178 config = self.get_config() 179 180 # Connexion config 181 connexion_config = {} 182 threads = self.get_threads() 183 184 # Threads 185 if threads: 186 connexion_config["threads"] = threads 187 188 # Memory 189 # if config.get("memory", None): 190 # connexion_config["memory_limit"] = config.get("memory") 191 if self.get_memory(): 192 connexion_config["memory_limit"] = self.get_memory() 193 194 # Temporary directory 195 if config.get("tmp", None): 196 connexion_config["temp_directory"] = config.get("tmp") 197 198 # Access 199 if config.get("access", None): 200 access = config.get("access") 201 if access in ["RO"]: 202 access = "READ_ONLY" 203 elif access in ["RW"]: 204 access = "READ_WRITE" 205 connexion_db = self.get_connexion_db() 206 if connexion_db in ":memory:": 207 access = "READ_WRITE" 208 connexion_config["access_mode"] = access 209 210 return connexion_config 211 212 def get_duckdb_settings(self) -> dict: 213 """ 214 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 215 string. 216 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 217 """ 218 219 # config 220 config = self.get_config() 221 222 # duckdb settings 223 duckdb_settings_dict = {} 224 if config.get("duckdb_settings", None): 225 duckdb_settings = config.get("duckdb_settings") 226 duckdb_settings = full_path(duckdb_settings) 227 # duckdb setting is a file 228 if os.path.exists(duckdb_settings): 229 with open(duckdb_settings) as json_file: 230 duckdb_settings_dict = yaml.safe_load(json_file) 231 # duckdb settings is a string 232 else: 233 duckdb_settings_dict = json.loads(duckdb_settings) 234 235 return duckdb_settings_dict 236 237 def set_connexion_db(self) -> str: 238 """ 239 The function `set_connexion_db` returns the appropriate database connection string based on the 240 input format and connection type. 241 :return: the value of the variable `connexion_db`. 
242 """ 243 244 # Default connexion db 245 default_connexion_db = ":memory:" 246 247 # Find connexion db 248 if self.get_input_format() in ["db", "duckdb"]: 249 connexion_db = self.get_input() 250 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 251 connexion_db = default_connexion_db 252 elif self.get_connexion_type() in ["tmpfile"]: 253 tmp_name = tempfile.mkdtemp( 254 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 255 ) 256 connexion_db = f"{tmp_name}/tmp.db" 257 elif self.get_connexion_type() != "": 258 connexion_db = self.get_connexion_type() 259 else: 260 connexion_db = default_connexion_db 261 262 # Set connexion db 263 self.connexion_db = connexion_db 264 265 return connexion_db 266 267 def set_connexion(self, conn) -> None: 268 """ 269 The function `set_connexion` creates a connection to a database, with options for different 270 database formats and settings. 271 272 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 273 database. If a connection is not provided, a new connection to an in-memory database is created. 
274 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 275 sqlite 276 """ 277 278 # Connexion db 279 connexion_db = self.set_connexion_db() 280 281 # Connexion config 282 connexion_config = self.get_connexion_config() 283 284 # Connexion format 285 connexion_format = self.get_config().get("connexion_format", "duckdb") 286 # Set connexion format 287 self.connexion_format = connexion_format 288 289 # Connexion 290 if not conn: 291 if connexion_format in ["duckdb"]: 292 conn = duckdb.connect(connexion_db, config=connexion_config) 293 # duckDB settings 294 duckdb_settings = self.get_duckdb_settings() 295 if duckdb_settings: 296 for setting in duckdb_settings: 297 setting_value = duckdb_settings.get(setting) 298 if isinstance(setting_value, str): 299 setting_value = f"'{setting_value}'" 300 conn.execute(f"PRAGMA {setting}={setting_value};") 301 elif connexion_format in ["sqlite"]: 302 conn = sqlite3.connect(connexion_db) 303 304 # Set connexion 305 self.conn = conn 306 307 # Log 308 log.debug(f"connexion_format: {connexion_format}") 309 log.debug(f"connexion_db: {connexion_db}") 310 log.debug(f"connexion config: {connexion_config}") 311 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 312 313 def set_output(self, output: str = None) -> None: 314 """ 315 The `set_output` function in Python sets the output file based on the input or a specified key 316 in the config file, extracting the output name, extension, and format. 317 318 :param output: The `output` parameter in the `set_output` method is used to specify the name of 319 the output file. If the config file has an 'output' key, the method sets the output to the value 320 of that key. 
        If no output is provided, it sets the output to `None`
        :type output: str
        """

        if output and not isinstance(output, str):
            self.output = output.name
        else:
            self.output = output

        # Output format
        if self.output:
            output_name, output_extension = os.path.splitext(self.output)
            self.output_name = output_name
            self.output_extension = output_extension
            self.output_format = self.output_extension.replace(".", "")
        else:
            self.output_name = None
            self.output_extension = None
            self.output_format = None

    def set_header(self) -> None:
        """
        Read the VCF header of the input file and store it both as a list of
        strings (`self.header_list`) and as a `vcf.Reader` object
        (`self.header_vcf`).

        Lookup order: the `header_file` config key, the input file itself
        (vcf/hdr formats), a sibling `<input>.hdr` file, and finally a header
        inferred from the file columns via the `Database` helper. Falls back
        to a minimal default VCF header when nothing can be read. When there
        is no input file, both attributes are set to None.
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with rel columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except masks any failure while probing
                    # the header (I/O, parsing, ...) — consider narrowing.
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None

    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
        DataFrame based on the connection format.

        :param query: The `query` parameter in the `get_query_to_df` function is a string that
        represents the SQL query you want to execute.
This query will be used to fetch data from a 449 database and convert it into a pandas DataFrame 450 :type query: str 451 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 452 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 453 function will only fetch up to that number of rows from the database query result. If no limit 454 is specified, 455 :type limit: int 456 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 457 """ 458 459 # Connexion format 460 connexion_format = self.get_connexion_format() 461 462 # Limit in query 463 if limit: 464 pd.set_option("display.max_rows", limit) 465 if connexion_format in ["duckdb"]: 466 df = ( 467 self.conn.execute(query) 468 .fetch_record_batch(limit) 469 .read_next_batch() 470 .to_pandas() 471 ) 472 elif connexion_format in ["sqlite"]: 473 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 474 475 # Full query 476 else: 477 if connexion_format in ["duckdb"]: 478 df = self.conn.execute(query).df() 479 elif connexion_format in ["sqlite"]: 480 df = pd.read_sql_query(query, self.conn) 481 482 return df 483 484 def get_overview(self) -> None: 485 """ 486 The function prints the input, output, config, and dataframe of the current object 487 """ 488 table_variants_from = self.get_table_variants(clause="from") 489 sql_columns = self.get_header_columns_as_sql() 490 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 491 df = self.get_query_to_df(sql_query_export) 492 log.info( 493 "Input: " 494 + str(self.get_input()) 495 + " [" 496 + str(str(self.get_input_format())) 497 + "]" 498 ) 499 log.info( 500 "Output: " 501 + str(self.get_output()) 502 + " [" 503 + str(str(self.get_output_format())) 504 + "]" 505 ) 506 log.info("Config: ") 507 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 508 "\n" 509 ): 510 log.info("\t" + str(d)) 511 log.info("Param: ") 512 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 513 "\n" 514 ): 515 log.info("\t" + str(d)) 516 log.info("Sample list: " + str(self.get_header_sample_list())) 517 log.info("Dataframe: ") 518 for d in str(df).split("\n"): 519 log.info("\t" + str(d)) 520 521 # garbage collector 522 del df 523 gc.collect() 524 525 return None 526 527 def get_stats(self) -> dict: 528 """ 529 The `get_stats` function calculates and returns various statistics of the current object, 530 including information about the input file, variants, samples, header fields, quality, and 531 SNVs/InDels. 532 :return: a dictionary containing various statistics of the current object. The dictionary has 533 the following structure: 534 """ 535 536 # Log 537 log.info(f"Stats Calculation...") 538 539 # table varaints 540 table_variants_from = self.get_table_variants() 541 542 # stats dict 543 stats = {"Infos": {}} 544 545 ### File 546 input_file = self.get_input() 547 stats["Infos"]["Input file"] = input_file 548 549 # Header 550 header_infos = self.get_header().infos 551 header_formats = self.get_header().formats 552 header_infos_list = list(header_infos) 553 header_formats_list = list(header_formats) 554 555 ### Variants 556 557 stats["Variants"] = {} 558 559 # Variants by chr 560 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 561 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 562 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 563 by=["CHROM"], kind="quicksort" 564 ) 565 566 # Total number of variants 567 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 568 569 # Calculate percentage 570 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 571 lambda x: (x / nb_of_variants) 572 ) 573 574 stats["Variants"]["Number of variants by chromosome"] = ( 575 nb_of_variants_by_chrom.to_dict(orient="index") 576 ) 577 578 
        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only when a GT FORMAT field and a FORMAT column exist
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Per-sample genotype counts; rows are kept only when the
                # sample value starts with a genotype and has as many
                # ':'-separated fields as the FORMAT column
                sql_query_samples = f"""
                    SELECT '{sample}' as sample,
                           REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                           count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                           concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                    )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` numbers fields continuously across both INFO and FORMAT sections
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map the special VCF "Number" codes to their symbols
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the 'InDel' branch below, SQL AND binds tighter
        # than OR, so the filter reads
        # len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT));
        # parentheses look missing — verify the intended classification.
        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                  AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                  AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

            """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
            """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

    def stats_to_file(self, file: str = None) -> str:
        """
        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
        into a JSON object, and writes the JSON object to the specified file.

        :param file: The `file` parameter is a string that represents the file path where the JSON data
        will be written
        :type file: str
        :return: the name of the file that was written to.
        """

        # Get stats
        stats = self.get_stats()

        # Serializing json
        json_object = json.dumps(stats, indent=4)

        # Writing to the stats file
        with open(file, "w") as outfile:
            outfile.write(json_object)

        return file

    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        The `print_stats` function generates a markdown file and prints the statistics contained in a
        JSON file in a formatted manner.

        :param output_file: The `output_file` parameter is a string that specifies the path and filename
        of the output file where the stats will be printed in Markdown format.
        If no `output_file` is
        provided, a temporary directory will be created and the stats will be saved in a file named
        "stats.md" within that
        :type output_file: str
        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
        file where the statistics will be saved. If no value is provided, a temporary directory will be
        created and a default file name "stats.json" will be used
        :type json_file: str
        :return: The function `print_stats` does not return any value. It has a return type annotation
        of `None`.
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Print stats file (yaml.safe_load also parses JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Render as a markdown table when the value converts
                        # to a DataFrame (directly or via JSON), else inline
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f" - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            # NOTE(review): `fp` shadows the module-level `fastparquet as fp`
            # alias inside this method — harmless here, but worth renaming.
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None

    def get_input(self) -> str:
        """
        It returns the value of the input variable.
        :return: The input is being returned.
        """
        return self.input

    def get_input_format(self, input_file: str = None) -> str:
        """
        Return the file format of `input_file`, or of this object's input file
        when no argument is given.

        :param input_file: optional path whose format should be detected;
        defaults to `self.get_input()`
        :type input_file: str
        :return: the detected file format (via `get_file_format`).
        """

        if not input_file:
            input_file = self.get_input()
        input_format = get_file_format(input_file)
        return input_format

    def get_input_compressed(self, input_file: str = None) -> str:
        """
        The function `get_input_compressed` returns the compression state of the
        input file.

        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
        that represents the file path of the input file.
        If no `input_file` is provided when calling the
        method, it will default to `None` and the method will then call `self.get_input()` to
        :type input_file: str
        :return: The function `get_input_compressed` returns the compressed format of the input
        variable.
        """

        if not input_file:
            input_file = self.get_input()
        input_compressed = get_file_compressed(input_file)
        return input_compressed

    def get_output(self) -> str:
        """
        Return the output file path set on this object.
        :return: the output file path (may be None).
        """

        return self.output

    def get_output_format(self, output_file: str = None) -> str:
        """
        Return the file format of `output_file`, or of this object's output
        file when no argument is given.

        :param output_file: optional path whose format should be detected;
        defaults to `self.get_output()`
        :type output_file: str
        :return: the detected file format (via `get_file_format`).
        """

        if not output_file:
            output_file = self.get_output()
        output_format = get_file_format(output_file)

        return output_format

    def get_config(self) -> dict:
        """
        It returns the config
        :return: The config variable is being returned.
        """
        return self.config

    def get_param(self) -> dict:
        """
        It returns the param
        :return: The param variable is being returned.
        """
        return self.param

    def get_connexion_db(self) -> str:
        """
        It returns the connexion_db attribute of the object
        :return: The connexion_db is being returned.
        """
        return self.connexion_db

    def get_prefix(self) -> str:
        """
        It returns the prefix of the object.
        :return: The prefix is being returned.
        """
        return self.prefix

    def get_table_variants(self, clause: str = "select") -> str:
        """
        This function returns the table_variants attribute of the object

        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
        defaults to select (optional)
        :return: The table_variants attribute of the object.
        """

        # Access
        access = self.get_config().get("access", None)

        # Clauses "select", "where", "update"
        if clause in ["select", "where", "update"]:
            table_variants = self.table_variants
        # Clause "from"
        elif clause in ["from"]:
            # For Read Only parquet input, query the file itself
            if self.get_input_format() in ["parquet"] and access in ["RO"]:
                input_file = self.get_input()
                table_variants = f"'{input_file}' as variants"
            # For Read Write
            else:
                table_variants = f"{self.table_variants} as variants"
        else:
            table_variants = self.table_variants
        return table_variants

    def get_tmp_dir(self) -> str:
        """
        The function `get_tmp_dir` returns the temporary directory path based on
        configuration, parameters, and a default value of "/tmp".
        :return: the temporary directory path.
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )

    def get_connexion_type(self) -> str:
        """
        Return the connexion type from the config, defaulting to "memory".

        :return: The connexion type is being returned.
        """
        return self.get_config().get("connexion_type", "memory")

    def get_connexion(self):
        """
        It returns the connection object

        :return: The connection object.
        """
        return self.conn

    def close_connexion(self) -> None:
        """
        This function closes the connection to the database.
        :return: The connection is being closed.
        """
        return self.conn.close()

    def get_header(self, type: str = "vcf"):
        """
        Return the VCF header, either as a `vcf.Reader` object or as a list of
        header lines.

        :param type: "vcf" for the `vcf.Reader` object, "list" for the raw
        header lines, defaults to vcf (optional)
        :return: the header in the requested representation; when no header is
        loaded, a minimal default header built from `vcf_required` is returned.
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required

    def get_header_length(self, file: str = None) -> int:
        """
        The function `get_header_length` returns the length of the header list, excluding the #CHROM
        line.

        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
        header file. If this argument is provided, the function will read the header from the specified
        file and return the length of the header list minus 1 (to exclude the #CHROM line)
        :type file: str
        :return: the length of the header list, excluding the #CHROM line (0
        when no header is available).
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0

    def get_header_columns(self) -> str:
        """
        Return the column line of the VCF header (the last header line,
        "#CHROM ...").

        :return: the #CHROM header line, or "" when no header is loaded.
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""

    def get_header_columns_as_list(self) -> list:
        """
        Return the VCF header columns as a list of column names.

        :return: the #CHROM header line split on tabs, or [] when no header is
        loaded.
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []

    def get_header_columns_as_sql(self) -> str:
        """
        Return the VCF header columns as a comma-separated list of quoted SQL
        identifiers, suitable for a SELECT clause.

        :return: the header columns formatted for SQL.
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)

    def get_header_sample_list(self) -> list:
        """
        Return the list of sample names declared in the VCF header.

        :return: the samples of the loaded `vcf.Reader` header.
        """
        return self.header_vcf.samples

    def get_verbose(self) -> bool:
        """
        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
        exist

        :return: The value of the key "verbose" in the config dictionary.
        """
        return self.get_config().get("verbose", False)

    def get_connexion_format(self) -> str:
        """
        Return the connexion format of the object ("duckdb" or "sqlite").

        :raises ValueError: when the stored connexion format is not supported.
        :return: The connexion_format is being returned.
        """
        connexion_format = self.connexion_format
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format

    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        The function reads a file in chunks and inserts each chunk into a table based on the specified
        database format.

        :param file: The `file` parameter is the file that you want to load into a table.
        It should be
        the path to the file on your system
        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
        should contain the names of the columns in the table where the data will be inserted. The column
        names should be separated by commas within the string. For example, if you have columns named
        "id", "name
        :type columns: str
        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
        the number of lines to skip at the beginning of the file before reading the actual data. This
        parameter allows you to skip any header information present in the file before processing the
        data, defaults to 0
        :type header_len: int (optional)
        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
        separator character that is used in the file being read. In this case, the default separator is
        set to `\t`, which represents a tab character. You can change this parameter to a different
        separator character if, defaults to \t
        :type sep: str (optional)
        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
        when processing the file in chunks. In the provided code snippet, the default value for
        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
        to 1000000
        :type chunksize: int (optional)
        """

        # Config: a configured "load.chunk" value overrides the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): when chunksize resolves to a falsy value (0/None),
        # nothing is inserted at all — verify this is intended.
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # `chunk` in the SQL is resolved to the local pandas
                    # DataFrame — presumably via duckdb's replacement scan of
                    # local variables; verify against the duckdb docs.
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
        table before loading the data and specify a sample size.

        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
        table
        :type input_file: str
        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
        the input file.
        If it is set to `None`, the default value of 20480 will be used, defaults to
        20480
        :type sample_size: int (optional)
        """

        log.info("Loading...")

        # change input file
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size (-1 acts as the "no sampling limit" sentinel)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read Only access gets a VIEW over the file; otherwise
                    # the data is materialized into a TABLE
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare except re-maps *any* failure (including
                # SQL errors) to "format not available" — consider narrowing.
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): this is an alias, not a copy — the additions below
            # also mutate `structure` (harmless here, but fragile).
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize define length of file chunk load file
            chunksize = 100000

            # delimiter
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): rebinding `input_file` means the with-block
                # closes the plain-text handle while the bgzf handle is never
                # closed — consider a nested `with bgzf.open(...)` instead.
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()

    def get_explode_infos(self) -> bool:
        """
        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
        to False if it is not set.
        :return: the value of the "explode_infos" parameter, or False when it is
        not present in the param dictionary.
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)

    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1403 1404 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1405 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1406 comma-separated list of field names to explode 1407 :type explode_infos_fields: str 1408 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1409 flag that determines whether to remove fields that are not present in the header. If it is set 1410 to `True`, any field that is not in the header will be excluded from the list of exploded 1411 information fields. If it is set to `, defaults to False 1412 :type remove_fields_not_in_header: bool (optional) 1413 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1414 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1415 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1416 Otherwise, it returns a list of exploded information fields after removing any spaces and 1417 splitting the string by commas. 
1418 """ 1419 1420 # If no fields, get it in param 1421 if not explode_infos_fields: 1422 explode_infos_fields = ( 1423 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1424 ) 1425 1426 # If no fields, defined as all fields in header using keyword 1427 if not explode_infos_fields: 1428 explode_infos_fields = "*" 1429 1430 # If fields list not empty 1431 if explode_infos_fields: 1432 1433 # Input fields list 1434 if isinstance(explode_infos_fields, str): 1435 fields_input = explode_infos_fields.split(",") 1436 elif isinstance(explode_infos_fields, list): 1437 fields_input = explode_infos_fields 1438 else: 1439 fields_input = [] 1440 1441 # Fields list without * keyword 1442 fields_without_all = fields_input.copy() 1443 if "*".casefold() in (item.casefold() for item in fields_without_all): 1444 fields_without_all.remove("*") 1445 1446 # Fields in header 1447 fields_in_header = sorted(list(set(self.get_header().infos))) 1448 1449 # Construct list of fields 1450 fields_output = [] 1451 for field in fields_input: 1452 1453 # Strip field 1454 field = field.strip() 1455 1456 # format keyword * in regex 1457 if field.upper() in ["*"]: 1458 field = ".*" 1459 1460 # Find all fields with pattern 1461 r = re.compile(field) 1462 fields_search = sorted(list(filter(r.match, fields_in_header))) 1463 1464 # Remove fields input from search 1465 if fields_search != [field]: 1466 fields_search = sorted( 1467 list(set(fields_search).difference(fields_input)) 1468 ) 1469 1470 # If field is not in header (avoid not well formatted header) 1471 if not fields_search and not remove_fields_not_in_header: 1472 fields_search = [field] 1473 1474 # Add found fields 1475 for new_field in fields_search: 1476 # Add field, if not already exists, and if it is in header (if asked) 1477 if ( 1478 new_field not in fields_output 1479 and ( 1480 not remove_fields_not_in_header 1481 or new_field in fields_in_header 1482 ) 1483 and new_field not in [".*"] 1484 ): 1485 
fields_output.append(new_field) 1486 1487 return fields_output 1488 1489 else: 1490 1491 return [] 1492 1493 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1494 """ 1495 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1496 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1497 not provided. 1498 1499 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1500 prefix to be used for exploding or expanding information 1501 :type explode_infos_prefix: str 1502 :return: the value of the variable `explode_infos_prefix`. 1503 """ 1504 1505 if not explode_infos_prefix: 1506 explode_infos_prefix = ( 1507 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1508 ) 1509 1510 return explode_infos_prefix 1511 1512 def add_column( 1513 self, 1514 table_name, 1515 column_name, 1516 column_type, 1517 default_value=None, 1518 drop: bool = False, 1519 ) -> dict: 1520 """ 1521 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1522 doesn't already exist. 1523 1524 :param table_name: The name of the table to which you want to add a column 1525 :param column_name: The parameter "column_name" is the name of the column that you want to add 1526 to the table 1527 :param column_type: The `column_type` parameter specifies the data type of the column that you 1528 want to add to the table. It should be a string that represents the desired data type, such as 1529 "INTEGER", "TEXT", "REAL", etc 1530 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1531 default value for the newly added column. 
If a default value is provided, it will be assigned to 1532 the column for any existing rows that do not have a value for that column 1533 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1534 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1535 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1536 to False 1537 :type drop: bool (optional) 1538 :return: a boolean value indicating whether the column was successfully added to the table. 1539 """ 1540 1541 # added 1542 added = False 1543 dropped = False 1544 1545 # Check if the column already exists in the table 1546 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1547 columns = self.get_query_to_df(query).columns.tolist() 1548 if column_name in columns: 1549 log.debug( 1550 f"The {column_name} column already exists in the {table_name} table" 1551 ) 1552 if drop: 1553 self.drop_column(table_name=table_name, column_name=column_name) 1554 dropped = True 1555 else: 1556 return None 1557 else: 1558 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1559 1560 # Add column in table 1561 add_column_query = ( 1562 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1563 ) 1564 if default_value is not None: 1565 add_column_query += f" DEFAULT {default_value}" 1566 self.execute_query(add_column_query) 1567 added = not dropped 1568 log.debug( 1569 f"The {column_name} column was successfully added to the {table_name} table" 1570 ) 1571 1572 if added: 1573 added_column = { 1574 "table_name": table_name, 1575 "column_name": column_name, 1576 "column_type": column_type, 1577 "default_value": default_value, 1578 } 1579 else: 1580 added_column = None 1581 1582 return added_column 1583 1584 def drop_column( 1585 self, column: dict = None, table_name: str = None, column_name: str = None 1586 ) -> bool: 1587 """ 1588 The `drop_column` function drops a 
specified column from a given table in a database and returns 1589 True if the column was successfully dropped, and False if the column does not exist in the 1590 table. 1591 1592 :param column: The `column` parameter is a dictionary that contains information about the column 1593 you want to drop. It has two keys: 1594 :type column: dict 1595 :param table_name: The `table_name` parameter is the name of the table from which you want to 1596 drop a column 1597 :type table_name: str 1598 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1599 from the table 1600 :type column_name: str 1601 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1602 and False if the column does not exist in the table. 1603 """ 1604 1605 # Find column infos 1606 if column: 1607 if isinstance(column, dict): 1608 table_name = column.get("table_name", None) 1609 column_name = column.get("column_name", None) 1610 elif isinstance(column, str): 1611 table_name = self.get_table_variants() 1612 column_name = column 1613 else: 1614 table_name = None 1615 column_name = None 1616 1617 if not table_name and not column_name: 1618 return False 1619 1620 # Removed 1621 removed = False 1622 1623 # Check if the column already exists in the table 1624 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1625 columns = self.get_query_to_df(query).columns.tolist() 1626 if column_name in columns: 1627 log.debug(f"The {column_name} column exists in the {table_name} table") 1628 else: 1629 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1630 return False 1631 1632 # Add column in table # ALTER TABLE integers DROP k 1633 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1634 self.execute_query(add_column_query) 1635 removed = True 1636 log.debug( 1637 f"The {column_name} column was successfully dropped to the {table_name} table" 1638 ) 1639 1640 return removed 1641 1642 def 
explode_infos( 1643 self, 1644 prefix: str = None, 1645 create_index: bool = False, 1646 fields: list = None, 1647 force: bool = False, 1648 proccess_all_fields_together: bool = False, 1649 ) -> list: 1650 """ 1651 The `explode_infos` function takes a VCF file and explodes the INFO fields into individual 1652 columns, returning a list of added columns. 1653 1654 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1655 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1656 `self.get_explode_infos_prefix()` as the prefix 1657 :type prefix: str 1658 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1659 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1660 `False`, indexes will not be created. The default value is `False`, defaults to False 1661 :type create_index: bool (optional) 1662 :param fields: The `fields` parameter is a list of INFO fields that you want to explode into 1663 individual columns. If this parameter is not provided, all INFO fields will be exploded 1664 :type fields: list 1665 :param force: The `force` parameter is a boolean flag that determines whether to drop and 1666 recreate the column if it already exists in the table. If `force` is set to `True`, the column 1667 will be dropped and recreated. If `force` is set to `False`, the column will not be dropped, 1668 defaults to False 1669 :type force: bool (optional) 1670 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1671 flag that determines whether to process all the INFO fields together or individually. If set to 1672 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1673 be processed individually, defaults to False 1674 :type proccess_all_fields_together: bool (optional) 1675 :return: The function `explode_infos` returns a list of added columns. 1676 """ 1677 1678 # drop indexes 1679 self.drop_indexes() 1680 1681 # connexion format 1682 connexion_format = self.get_connexion_format() 1683 1684 # Access 1685 access = self.get_config().get("access", None) 1686 1687 # Added columns 1688 added_columns = [] 1689 1690 if access not in ["RO"]: 1691 1692 # prefix 1693 if prefix in [None, True] or not isinstance(prefix, str): 1694 if self.get_explode_infos_prefix() not in [None, True]: 1695 prefix = self.get_explode_infos_prefix() 1696 else: 1697 prefix = "INFO/" 1698 1699 # table variants 1700 table_variants = self.get_table_variants(clause="select") 1701 1702 # extra infos 1703 try: 1704 extra_infos = self.get_extra_infos() 1705 except: 1706 extra_infos = [] 1707 1708 # Header infos 1709 header_infos = self.get_header().infos 1710 1711 log.debug( 1712 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1713 ) 1714 1715 sql_info_alter_table_array = [] 1716 1717 # Info fields to check 1718 fields_list = list(header_infos) 1719 if fields: 1720 fields_list += fields 1721 fields_list = set(fields_list) 1722 1723 # If no fields 1724 if not fields: 1725 fields = [] 1726 1727 # Translate fields if patterns 1728 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1729 1730 for info in fields: 1731 1732 info_id_sql = prefix + info 1733 1734 if ( 1735 info in fields_list 1736 or prefix + info in fields_list 1737 or info in extra_infos 1738 ): 1739 1740 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1741 1742 if info in header_infos: 1743 info_type = header_infos[info].type 1744 info_num = header_infos[info].num 1745 else: 1746 info_type = "String" 1747 info_num = 0 1748 1749 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1750 if 
info_num != 1: 1751 type_sql = "VARCHAR" 1752 1753 # Add field 1754 added_column = self.add_column( 1755 table_name=table_variants, 1756 column_name=info_id_sql, 1757 column_type=type_sql, 1758 default_value="null", 1759 drop=force, 1760 ) 1761 1762 if added_column: 1763 added_columns.append(added_column) 1764 1765 if added_column or force: 1766 1767 # add field to index 1768 self.index_additionnal_fields.append(info_id_sql) 1769 1770 # Update field array 1771 if connexion_format in ["duckdb"]: 1772 update_info_field = f""" 1773 "{info_id_sql}" = 1774 CASE 1775 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1776 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1777 END 1778 """ 1779 elif connexion_format in ["sqlite"]: 1780 update_info_field = f""" 1781 "{info_id_sql}" = 1782 CASE 1783 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1784 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1785 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1786 END 1787 """ 1788 1789 sql_info_alter_table_array.append(update_info_field) 1790 1791 if sql_info_alter_table_array: 1792 1793 # By chromosomes 1794 try: 1795 chromosomes_list = list( 1796 self.get_query_to_df( 1797 f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1798 )["#CHROM"] 1799 ) 1800 except: 1801 chromosomes_list = [None] 1802 1803 for chrom in chromosomes_list: 1804 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1805 1806 # Where clause 1807 where_clause = "" 1808 if chrom and len(chromosomes_list) > 1: 1809 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1810 1811 # Update table 1812 if 
proccess_all_fields_together: 1813 sql_info_alter_table_array_join = ", ".join( 1814 sql_info_alter_table_array 1815 ) 1816 if sql_info_alter_table_array_join: 1817 sql_info_alter_table = f""" 1818 UPDATE {table_variants} 1819 SET {sql_info_alter_table_array_join} 1820 {where_clause} 1821 """ 1822 log.debug( 1823 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1824 ) 1825 # log.debug(sql_info_alter_table) 1826 self.conn.execute(sql_info_alter_table) 1827 else: 1828 sql_info_alter_num = 0 1829 for sql_info_alter in sql_info_alter_table_array: 1830 sql_info_alter_num += 1 1831 sql_info_alter_table = f""" 1832 UPDATE {table_variants} 1833 SET {sql_info_alter} 1834 {where_clause} 1835 """ 1836 log.debug( 1837 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 1838 ) 1839 # log.debug(sql_info_alter_table) 1840 self.conn.execute(sql_info_alter_table) 1841 1842 # create indexes 1843 if create_index: 1844 self.create_indexes() 1845 1846 return added_columns 1847 1848 def create_indexes(self) -> None: 1849 """ 1850 Create indexes on the table after insertion 1851 """ 1852 1853 # Access 1854 access = self.get_config().get("access", None) 1855 1856 # get table variants 1857 table_variants = self.get_table_variants("FROM") 1858 1859 if self.get_indexing() and access not in ["RO"]: 1860 # Create index 1861 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 1862 self.conn.execute(sql_create_table_index) 1863 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 1864 self.conn.execute(sql_create_table_index) 1865 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 1866 self.conn.execute(sql_create_table_index) 1867 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS 
idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1868 self.conn.execute(sql_create_table_index) 1869 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1870 self.conn.execute(sql_create_table_index) 1871 for field in self.index_additionnal_fields: 1872 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1873 self.conn.execute(sql_create_table_index) 1874 1875 def drop_indexes(self) -> None: 1876 """ 1877 Create indexes on the table after insertion 1878 """ 1879 1880 # Access 1881 access = self.get_config().get("access", None) 1882 1883 # get table variants 1884 table_variants = self.get_table_variants("FROM") 1885 1886 # Get database format 1887 connexion_format = self.get_connexion_format() 1888 1889 if access not in ["RO"]: 1890 if connexion_format in ["duckdb"]: 1891 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 1892 elif connexion_format in ["sqlite"]: 1893 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 1894 1895 list_indexes = self.conn.execute(sql_list_indexes) 1896 index_names = [row[0] for row in list_indexes.fetchall()] 1897 for index in index_names: 1898 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 1899 self.conn.execute(sql_drop_table_index) 1900 1901 def read_vcf_header(self, f) -> list: 1902 """ 1903 It reads the header of a VCF file and returns a list of the header lines 1904 1905 :param f: the file object 1906 :return: The header lines of the VCF file. 
1907 """ 1908 1909 header_list = [] 1910 for line in f: 1911 header_list.append(line) 1912 if line.startswith("#CHROM"): 1913 break 1914 return header_list 1915 1916 def read_vcf_header_file(self, file: str = None) -> list: 1917 """ 1918 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 1919 uncompressed files. 1920 1921 :param file: The `file` parameter is a string that represents the path to the VCF header file 1922 that you want to read. It is an optional parameter, so if you don't provide a value, it will 1923 default to `None` 1924 :type file: str 1925 :return: The function `read_vcf_header_file` returns a list. 1926 """ 1927 1928 if self.get_input_compressed(input_file=file): 1929 with bgzf.open(file, "rt") as f: 1930 return self.read_vcf_header(f=f) 1931 else: 1932 with open(file, "rt") as f: 1933 return self.read_vcf_header(f=f) 1934 1935 def execute_query(self, query: str): 1936 """ 1937 It takes a query as an argument, executes it, and returns the results 1938 1939 :param query: The query to be executed 1940 :return: The result of the query is being returned. 1941 """ 1942 if query: 1943 return self.conn.execute(query) # .fetchall() 1944 else: 1945 return None 1946 1947 def export_output( 1948 self, 1949 output_file: str | None = None, 1950 output_header: str | None = None, 1951 export_header: bool = True, 1952 query: str | None = None, 1953 parquet_partitions: list | None = None, 1954 chunk_size: int | None = None, 1955 threads: int | None = None, 1956 sort: bool = False, 1957 index: bool = False, 1958 order_by: str | None = None, 1959 ) -> bool: 1960 """ 1961 The `export_output` function exports data from a VCF file to a specified output file in various 1962 formats, including VCF, CSV, TSV, PSV, and Parquet. 1963 1964 :param output_file: The `output_file` parameter is a string that specifies the name of the 1965 output file to be generated by the function. 
This is where the exported data will be saved 1966 :type output_file: str 1967 :param output_header: The `output_header` parameter is a string that specifies the name of the 1968 file where the header of the VCF file will be exported. If this parameter is not provided, the 1969 header will be exported to a file with the same name as the `output_file` parameter, but with 1970 the extension " 1971 :type output_header: str 1972 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1973 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1974 True, the header will be exported to a file. If `export_header` is False, the header will not 1975 be, defaults to True, if output format is not VCF 1976 :type export_header: bool (optional) 1977 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1978 select specific data from the VCF file before exporting it. If provided, only the data that 1979 matches the query will be exported 1980 :type query: str 1981 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 1982 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 1983 organize data in a hierarchical directory structure based on the values of one or more columns. 1984 This can improve query performance when working with large datasets 1985 :type parquet_partitions: list 1986 :param chunk_size: The `chunk_size` parameter specifies the number of 1987 records in batch when exporting data in Parquet format. This parameter is used for 1988 partitioning the Parquet file into multiple files. 1989 :type chunk_size: int 1990 :param threads: The `threads` parameter is an optional parameter that specifies the number of 1991 threads to be used during the export process. It determines the level of parallelism and can 1992 improve the performance of the export operation. 
If not provided, the function will use the 1993 default number of threads 1994 :type threads: int 1995 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 1996 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 1997 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 1998 False 1999 :type sort: bool (optional) 2000 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2001 created on the output file. If `index` is True, an index will be created. If `index` is False, 2002 no index will be created. The default value is False, defaults to False 2003 :type index: bool (optional) 2004 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2005 sorting the output file. This parameter is only applicable when exporting data in VCF format 2006 :type order_by: str 2007 :return: a boolean value. It checks if the output file exists and returns True if it does, or 2008 None if it doesn't. 
2009 """ 2010 2011 # Log 2012 log.info("Exporting...") 2013 2014 # Full path 2015 output_file = full_path(output_file) 2016 output_header = full_path(output_header) 2017 2018 # Config 2019 config = self.get_config() 2020 2021 # Param 2022 param = self.get_param() 2023 2024 # Tmp files to remove 2025 tmp_to_remove = [] 2026 2027 # If no output, get it 2028 if not output_file: 2029 output_file = self.get_output() 2030 2031 # If not threads 2032 if not threads: 2033 threads = self.get_threads() 2034 2035 # Auto header name with extension 2036 if export_header or output_header: 2037 if not output_header: 2038 output_header = f"{output_file}.hdr" 2039 # Export header 2040 self.export_header(output_file=output_file) 2041 2042 # Switch off export header if VCF output 2043 output_file_type = get_file_format(output_file) 2044 if output_file_type in ["vcf"]: 2045 export_header = False 2046 tmp_to_remove.append(output_header) 2047 2048 # Chunk size 2049 if not chunk_size: 2050 chunk_size = config.get("chunk_size", None) 2051 2052 # Parquet partition 2053 if not parquet_partitions: 2054 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2055 if parquet_partitions and isinstance(parquet_partitions, str): 2056 parquet_partitions = parquet_partitions.split(",") 2057 2058 # Order by 2059 if not order_by: 2060 order_by = param.get("export", {}).get("order_by", "") 2061 2062 # Header in output 2063 header_in_output = param.get("export", {}).get("include_header", False) 2064 2065 # Database 2066 database_source = self.get_connexion() 2067 2068 # Connexion format 2069 connexion_format = self.get_connexion_format() 2070 2071 # Explode infos 2072 if self.get_explode_infos(): 2073 self.explode_infos( 2074 prefix=self.get_explode_infos_prefix(), 2075 fields=self.get_explode_infos_fields(), 2076 force=False, 2077 ) 2078 2079 # if connexion_format in ["sqlite"] or query: 2080 if connexion_format in ["sqlite"]: 2081 2082 # Export in Parquet 2083 random_tmp = 
"".join( 2084 random.choice(string.ascii_lowercase) for i in range(10) 2085 ) 2086 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2087 tmp_to_remove.append(database_source) 2088 2089 # Table Variants 2090 table_variants = self.get_table_variants() 2091 2092 # Create export query 2093 sql_query_export_subquery = f""" 2094 SELECT * FROM {table_variants} 2095 """ 2096 2097 # Write source file 2098 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2099 2100 # Create database 2101 database = Database( 2102 database=database_source, 2103 table="variants", 2104 header_file=output_header, 2105 conn_config=self.get_connexion_config(), 2106 ) 2107 2108 # Existing colomns header 2109 # existing_columns_header = database.get_header_file_columns(output_header) 2110 existing_columns_header = database.get_header_columns_from_database() 2111 2112 # Export file 2113 database.export( 2114 output_database=output_file, 2115 output_header=output_header, 2116 existing_columns_header=existing_columns_header, 2117 parquet_partitions=parquet_partitions, 2118 chunk_size=chunk_size, 2119 threads=threads, 2120 sort=sort, 2121 index=index, 2122 header_in_output=header_in_output, 2123 order_by=order_by, 2124 query=query, 2125 export_header=export_header, 2126 ) 2127 2128 # Remove 2129 remove_if_exists(tmp_to_remove) 2130 2131 return (os.path.exists(output_file) or None) and ( 2132 os.path.exists(output_file) or None 2133 ) 2134 2135 def get_extra_infos(self, table: str = None) -> list: 2136 """ 2137 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2138 in the header. 2139 2140 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2141 name of the table from which you want to retrieve the extra columns that are not present in the 2142 header. 
If the `table` parameter is not provided when calling the function, it will default to 2143 using the variants 2144 :type table: str 2145 :return: A list of columns that are in the specified table but not in the header of the table. 2146 """ 2147 2148 header_columns = [] 2149 2150 if not table: 2151 table = self.get_table_variants(clause="from") 2152 header_columns = self.get_header_columns() 2153 2154 # Check all columns in the database 2155 query = f""" SELECT * FROM {table} LIMIT 1 """ 2156 log.debug(f"query {query}") 2157 table_columns = self.get_query_to_df(query).columns.tolist() 2158 extra_columns = [] 2159 2160 # Construct extra infos (not in header) 2161 for column in table_columns: 2162 if column not in header_columns: 2163 extra_columns.append(column) 2164 2165 return extra_columns 2166 2167 def get_extra_infos_sql(self, table: str = None) -> str: 2168 """ 2169 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2170 by double quotes 2171 2172 :param table: The name of the table to get the extra infos from. If None, the default table is 2173 used 2174 :type table: str 2175 :return: A string of the extra infos 2176 """ 2177 2178 return ", ".join( 2179 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2180 ) 2181 2182 def export_header( 2183 self, 2184 header_name: str = None, 2185 output_file: str = None, 2186 output_file_ext: str = ".hdr", 2187 clean_header: bool = True, 2188 remove_chrom_line: bool = False, 2189 ) -> str: 2190 """ 2191 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2192 specified options, and writes it to a new file. 2193 2194 :param header_name: The `header_name` parameter is the name of the header file to be created. 
If 2195 this parameter is not specified, the header will be written to the output file 2196 :type header_name: str 2197 :param output_file: The `output_file` parameter in the `export_header` function is used to 2198 specify the name of the output file where the header will be written. If this parameter is not 2199 provided, the header will be written to a temporary file 2200 :type output_file: str 2201 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2202 string that represents the extension of the output header file. By default, it is set to ".hdr" 2203 if not specified by the user. This extension will be appended to the `output_file` name to 2204 create the final, defaults to .hdr 2205 :type output_file_ext: str (optional) 2206 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2207 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2208 `True`, the function will clean the header by modifying certain lines based on a specific 2209 pattern. If `clean_header`, defaults to True 2210 :type clean_header: bool (optional) 2211 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2212 boolean flag that determines whether the #CHROM line should be removed from the header before 2213 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2214 defaults to False 2215 :type remove_chrom_line: bool (optional) 2216 :return: The function `export_header` returns the name of the temporary header file that is 2217 created. 
2218 """ 2219 2220 if not header_name and not output_file: 2221 output_file = self.get_output() 2222 2223 if self.get_header(): 2224 2225 # Get header object 2226 header_obj = self.get_header() 2227 2228 # Create database 2229 db_for_header = Database(database=self.get_input()) 2230 2231 # Get real columns in the file 2232 db_header_columns = db_for_header.get_columns() 2233 2234 with tempfile.TemporaryDirectory() as tmpdir: 2235 2236 # Write header file 2237 header_file_tmp = os.path.join(tmpdir, "header") 2238 f = open(header_file_tmp, "w") 2239 vcf.Writer(f, header_obj) 2240 f.close() 2241 2242 # Replace #CHROM line with rel columns 2243 header_list = db_for_header.read_header_file( 2244 header_file=header_file_tmp 2245 ) 2246 header_list[-1] = "\t".join(db_header_columns) 2247 2248 # Remove CHROM line 2249 if remove_chrom_line: 2250 header_list.pop() 2251 2252 # Clean header 2253 if clean_header: 2254 header_list_clean = [] 2255 for head in header_list: 2256 # Clean head for malformed header 2257 head_clean = head 2258 head_clean = re.subn( 2259 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2260 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2261 head_clean, 2262 2, 2263 )[0] 2264 # Write header 2265 header_list_clean.append(head_clean) 2266 header_list = header_list_clean 2267 2268 tmp_header_name = output_file + output_file_ext 2269 2270 f = open(tmp_header_name, "w") 2271 for line in header_list: 2272 f.write(line) 2273 f.close() 2274 2275 return tmp_header_name 2276 2277 def export_variant_vcf( 2278 self, 2279 vcf_file, 2280 remove_info: bool = False, 2281 add_samples: bool = True, 2282 list_samples: list = [], 2283 where_clause: str = "", 2284 index: bool = False, 2285 threads: int | None = None, 2286 ) -> bool | None: 2287 """ 2288 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2289 remove INFO field, add samples, and control compression and indexing. 

        :param vcf_file: Name of the output file the VCF data will be written to
        :param remove_info: If True, the INFO column is replaced by '.' in the
            output; if a string is given, that string is used as the INFO
            replacement value instead. Defaults to False (keep INFO)
        :type remove_info: bool (optional)
        :param add_samples: If True (default), the FORMAT and sample columns are
            included in the output VCF file; if False, samples are dropped
        :type add_samples: bool (optional)
        :param list_samples: List of samples to include in the output VCF file.
            Empty (default) means all samples from the header
        :type list_samples: list
        :param where_clause: Optional SQL WHERE clause appended to the SELECT on
            the variants table to filter exported records
        :type where_clause: str
        :param index: If True, the output VCF file will be indexed (tabix),
            defaults to False
        :type index: bool (optional)
        :param threads: Number of threads to use for the export; defaults to the
            object's configured thread count
        :type threads: int | None
        :return: The result of calling `export_output` with the generated query
            and the sort and index flags
        """

        # Config
        config = self.get_config()

        # Extract VCF
        log.debug("Export VCF...")

        # Table variants
        table_variants = self.get_table_variants()

        # Threads
        if not threads:
            threads = self.get_threads()

        # Info fields: either the raw INFO column, or a constant replacement value
        if remove_info:
            if not isinstance(remove_info, str):
                remove_info = "."
            info_field = f"""'{remove_info}' as INFO"""
        else:
            info_field = "INFO"

        # Samples fields: FORMAT plus the selected sample columns (or nothing)
        if add_samples:
            if not list_samples:
                list_samples = self.get_header_sample_list()
            if list_samples:
                samples_fields = " , FORMAT , " + " , ".join(list_samples)
            else:
                samples_fields = ""
            log.debug(f"samples_fields: {samples_fields}")
        else:
            samples_fields = ""

        # Where clause
        if where_clause is None:
            where_clause = ""

        # Variants
        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
        log.debug(f"sql_query_select={sql_query_select}")

        return self.export_output(
            output_file=vcf_file,
            output_header=None,
            export_header=True,
            query=sql_query_select,
            parquet_partitions=None,
            chunk_size=config.get("chunk_size", None),
            threads=threads,
            sort=True,
            index=index,
            order_by=None,
        )

    def run_commands(self, commands: list = [], threads: int = 1) -> None:
        """
        It takes a list of commands and runs them in parallel using the number of threads specified

        :param commands: A list of commands to run
        :param threads: The number of threads to use, defaults to 1 (optional)
        """

        run_parallel_commands(commands, threads)

    def get_threads(self, default: int = 1) -> int:
        """
        This function returns the number of threads to use for a job, with a default value if not
        specified in param or config.

        :param default: Default number of threads when neither param nor config
            provides a 'threads' value, defaults to 1
        :type default: int (optional)
        :return: the number of threads to use for the current job.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input threads (param takes precedence over config)
        input_thread = param.get("threads", config.get("threads", None))

        # Check threads: a value <= 0 means "use all available cores"
        if not input_thread:
            threads = default
        elif int(input_thread) <= 0:
            threads = os.cpu_count()
        else:
            threads = int(input_thread)
        return threads

    def get_memory(self, default: str = None) -> str:
        """
        This function retrieves the memory value from parameters or configuration with a default value
        if not found.

        :param default: Fallback value returned when 'memory' is present in
            neither the param dict nor the config dict
        :type default: str
        :return: The memory setting (param takes precedence over config),
            or the default.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input memory (param takes precedence over config)
        input_memory = param.get("memory", config.get("memory", None))

        # Check memory
        if input_memory:
            memory = input_memory
        else:
            memory = default

        return memory

    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Dispatch on the connexion format: duckdb uses the DataFrame-based update,
        sqlite uses the temporary-table-based update.

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        It takes a VCF file and updates the INFO column of the variants table in the database with the
        INFO column of the VCF file

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Loading VCF into a DataFrame, skipping meta-header lines and using the
        # #CHROM line as column names. The SQL below references 'vcf_df' directly:
        # duckdb resolves that name to this local DataFrame (replacement scan).
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )

        # Append the VCF INFO to the existing INFO, separated by ';' only when
        # both sides are non-empty ('' and '.' count as empty)
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
        table, then updates the INFO column of the variants table with the INFO column of the temporary
        table

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same structure as 'variants'
        # (WHERE 0 copies the schema but no rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Loading VCF into temporary table; comment='#' also drops the #CHROM
        # header line, so the 8 standard column names are assigned manually
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator (SQLite string concatenation)
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

    def drop_variants_table(self) -> None:
        """
        This function drops the variants table (if it exists).
        """

        table_variants = self.get_table_variants()
        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
        self.conn.execute(sql_table_variants)

    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        It adds a column to the variants table called `variant_id` and populates it with a hash
        computed from the assembly and the `#CHROM`, `POS`, `REF`, `ALT` columns

        :param variant_id_column: The name of the column to be created in the variants table, defaults
            to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into a table column (removed again below)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the presence check uses the literal "variant_id", not
        # variant_id_column — confirm this is intended when a custom name is passed
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column with the hash value
            # NOTE(review): the last hash argument is the single-quoted literal
            # '"{prefix}SVTYPE"' (a constant string), not the column value — verify
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        This function returns the variant_id column name, creating/refreshing the
        column via `set_variant_id`.

        :param variant_id_column: The name of the column that contains the variant IDs,
            defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: Passed through to `set_variant_id` to force (re)creation of the column
        :type force: bool
        :return: The variant_id column name.
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

    ###
    # Annotation
    ###

    def scan_databases(
        self,
        database_formats: list = ["parquet"],
        database_releases: list = ["current"],
    ) -> dict:
        """
        The function `scan_databases` scans for available databases based on specified formats and
        releases.

        :param database_formats: The formats of the databases to be scanned
            (e.g. "parquet"), defaults to ["parquet"]
        :type database_formats: list
        :param database_releases: The releases of the databases to be scanned,
            defaults to ["current"]
        :type database_releases: list
        :return: A dictionary describing the databases matching the requested
            formats and releases.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for available databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict

    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Annotations databases folders: generic annotations + parquet + bcftools folders
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string of annotation sources)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tool-specific param is folded into the common annotations list,
        # prefixed with its tool name (except parquet, the default tool)
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: a string is split on ',' and each
            # entry mapped to {"INFO": None} (all INFO fields); otherwise the
            # value is assumed to already be a dict of annotations
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL: scan for every available database
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases),
                    # e.g. "ALL:format=parquet+vcf:release=current"
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff: everything after the first ':' is options
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar: each ':'-separated token is an annotation name
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection: an explicit "bcftools:"/"snpsift:" prefix
                        # forces the tool, otherwise it is inferred from the file
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ('+' and ':' both act as separators)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders (no assembly subfolder)
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # bcftools_preference is a constant False here,
                                    # so the bcftools branch below is never taken
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" appears twice in this list
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        self.set_param(param)

        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser",
None) is not None: 3075 log.info("Annotations 'exomiser'...") 3076 self.annotation_exomiser() 3077 if param.get("annotation", {}).get("splice", None) is not None: 3078 log.info("Annotations 'splice' ...") 3079 self.annotation_splice() 3080 3081 # Explode INFOS fields into table fields 3082 if self.get_explode_infos(): 3083 self.explode_infos( 3084 prefix=self.get_explode_infos_prefix(), 3085 fields=self.get_explode_infos_fields(), 3086 force=True, 3087 ) 3088 3089 def annotation_snpsift(self, threads: int = None) -> None: 3090 """ 3091 This function annotate with bcftools 3092 3093 :param threads: Number of threads to use 3094 :return: the value of the variable "return_value". 3095 """ 3096 3097 # DEBUG 3098 log.debug("Start annotation with bcftools databases") 3099 3100 # Threads 3101 if not threads: 3102 threads = self.get_threads() 3103 log.debug("Threads: " + str(threads)) 3104 3105 # Config 3106 config = self.get_config() 3107 log.debug("Config: " + str(config)) 3108 3109 # Config - snpSift 3110 snpsift_bin_command = get_bin_command( 3111 bin="SnpSift.jar", 3112 tool="snpsift", 3113 bin_type="jar", 3114 config=config, 3115 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3116 ) 3117 if not snpsift_bin_command: 3118 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3119 log.error(msg_err) 3120 raise ValueError(msg_err) 3121 3122 # Config - bcftools 3123 bcftools_bin_command = get_bin_command( 3124 bin="bcftools", 3125 tool="bcftools", 3126 bin_type="bin", 3127 config=config, 3128 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3129 ) 3130 if not bcftools_bin_command: 3131 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3132 log.error(msg_err) 3133 raise ValueError(msg_err) 3134 3135 # Config - BCFTools databases folders 3136 databases_folders = set( 3137 self.get_config() 3138 .get("folders", {}) 3139 .get("databases", {}) 3140 .get("annotations", ["."]) 3141 + self.get_config() 3142 .get("folders", {}) 3143 
.get("databases", {}) 3144 .get("bcftools", ["."]) 3145 ) 3146 log.debug("Databases annotations: " + str(databases_folders)) 3147 3148 # Param 3149 annotations = ( 3150 self.get_param() 3151 .get("annotation", {}) 3152 .get("snpsift", {}) 3153 .get("annotations", None) 3154 ) 3155 log.debug("Annotations: " + str(annotations)) 3156 3157 # Assembly 3158 assembly = self.get_param().get( 3159 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3160 ) 3161 3162 # Data 3163 table_variants = self.get_table_variants() 3164 3165 # Check if not empty 3166 log.debug("Check if not empty") 3167 sql_query_chromosomes = ( 3168 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3169 ) 3170 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3171 if not sql_query_chromosomes_df["count"][0]: 3172 log.info(f"VCF empty") 3173 return 3174 3175 # VCF header 3176 vcf_reader = self.get_header() 3177 log.debug("Initial header: " + str(vcf_reader.infos)) 3178 3179 # Existing annotations 3180 for vcf_annotation in self.get_header().infos: 3181 3182 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3183 log.debug( 3184 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3185 ) 3186 3187 if annotations: 3188 3189 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3190 3191 # Export VCF file 3192 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3193 3194 # Init 3195 commands = {} 3196 3197 for annotation in annotations: 3198 annotation_fields = annotations[annotation] 3199 3200 # Annotation Name 3201 annotation_name = os.path.basename(annotation) 3202 3203 if not annotation_fields: 3204 annotation_fields = {"INFO": None} 3205 3206 log.debug(f"Annotation '{annotation_name}'") 3207 log.debug( 3208 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3209 ) 3210 3211 # Create Database 3212 database = Database( 3213 database=annotation, 3214 databases_folders=databases_folders, 3215 
assembly=assembly, 3216 ) 3217 3218 # Find files 3219 db_file = database.get_database() 3220 db_file = full_path(db_file) 3221 db_hdr_file = database.get_header_file() 3222 db_hdr_file = full_path(db_hdr_file) 3223 db_file_type = database.get_format() 3224 db_tbi_file = f"{db_file}.tbi" 3225 db_file_compressed = database.is_compressed() 3226 3227 # Check if compressed 3228 if not db_file_compressed: 3229 log.error( 3230 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3231 ) 3232 raise ValueError( 3233 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3234 ) 3235 3236 # Check if indexed 3237 if not os.path.exists(db_tbi_file): 3238 log.error( 3239 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3240 ) 3241 raise ValueError( 3242 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3243 ) 3244 3245 # Check index - try to create if not exists 3246 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3247 log.error("Annotation failed: database not valid") 3248 log.error(f"Annotation annotation file: {db_file}") 3249 log.error(f"Annotation annotation header: {db_hdr_file}") 3250 log.error(f"Annotation annotation index: {db_tbi_file}") 3251 raise ValueError( 3252 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3253 ) 3254 else: 3255 3256 log.debug( 3257 f"Annotation '{annotation}' - file: " 3258 + str(db_file) 3259 + " and " 3260 + str(db_hdr_file) 3261 ) 3262 3263 # Load header as VCF object 3264 db_hdr_vcf = Variants(input=db_hdr_file) 3265 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3266 log.debug( 3267 "Annotation database header: " 3268 + str(db_hdr_vcf_header_infos) 3269 ) 3270 3271 # For all fields in database 3272 annotation_fields_full = False 3273 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3274 annotation_fields = { 3275 key: key for key in 
db_hdr_vcf_header_infos 3276 } 3277 log.debug( 3278 "Annotation database header - All annotations added: " 3279 + str(annotation_fields) 3280 ) 3281 annotation_fields_full = True 3282 3283 # # Create file for field rename 3284 # log.debug("Create file for field rename") 3285 # tmp_rename = NamedTemporaryFile( 3286 # prefix=self.get_prefix(), 3287 # dir=self.get_tmp_dir(), 3288 # suffix=".rename", 3289 # delete=False, 3290 # ) 3291 # tmp_rename_name = tmp_rename.name 3292 # tmp_files.append(tmp_rename_name) 3293 3294 # Number of fields 3295 nb_annotation_field = 0 3296 annotation_list = [] 3297 annotation_infos_rename_list = [] 3298 3299 for annotation_field in annotation_fields: 3300 3301 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3302 annotation_fields_new_name = annotation_fields.get( 3303 annotation_field, annotation_field 3304 ) 3305 if not annotation_fields_new_name: 3306 annotation_fields_new_name = annotation_field 3307 3308 # Check if field is in DB and if field is not elready in input data 3309 if ( 3310 annotation_field in db_hdr_vcf.get_header().infos 3311 and annotation_fields_new_name 3312 not in self.get_header().infos 3313 ): 3314 3315 log.info( 3316 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3317 ) 3318 3319 # BCFTools annotate param to rename fields 3320 if annotation_field != annotation_fields_new_name: 3321 annotation_infos_rename_list.append( 3322 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3323 ) 3324 3325 # Add INFO field to header 3326 db_hdr_vcf_header_infos_number = ( 3327 db_hdr_vcf_header_infos[annotation_field].num or "." 
3328 ) 3329 db_hdr_vcf_header_infos_type = ( 3330 db_hdr_vcf_header_infos[annotation_field].type 3331 or "String" 3332 ) 3333 db_hdr_vcf_header_infos_description = ( 3334 db_hdr_vcf_header_infos[annotation_field].desc 3335 or f"{annotation_field} description" 3336 ) 3337 db_hdr_vcf_header_infos_source = ( 3338 db_hdr_vcf_header_infos[annotation_field].source 3339 or "unknown" 3340 ) 3341 db_hdr_vcf_header_infos_version = ( 3342 db_hdr_vcf_header_infos[annotation_field].version 3343 or "unknown" 3344 ) 3345 3346 vcf_reader.infos[annotation_fields_new_name] = ( 3347 vcf.parser._Info( 3348 annotation_fields_new_name, 3349 db_hdr_vcf_header_infos_number, 3350 db_hdr_vcf_header_infos_type, 3351 db_hdr_vcf_header_infos_description, 3352 db_hdr_vcf_header_infos_source, 3353 db_hdr_vcf_header_infos_version, 3354 self.code_type_map[ 3355 db_hdr_vcf_header_infos_type 3356 ], 3357 ) 3358 ) 3359 3360 annotation_list.append(annotation_field) 3361 3362 nb_annotation_field += 1 3363 3364 else: 3365 3366 if ( 3367 annotation_field 3368 not in db_hdr_vcf.get_header().infos 3369 ): 3370 log.warning( 3371 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3372 ) 3373 if ( 3374 annotation_fields_new_name 3375 in self.get_header().infos 3376 ): 3377 log.warning( 3378 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3379 ) 3380 3381 log.info( 3382 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3383 ) 3384 3385 annotation_infos = ",".join(annotation_list) 3386 3387 if annotation_infos != "": 3388 3389 # Annotated VCF (and error file) 3390 tmp_annotation_vcf_name = os.path.join( 3391 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3392 ) 3393 tmp_annotation_vcf_name_err = ( 3394 tmp_annotation_vcf_name + ".err" 3395 ) 3396 3397 # Add fields to annotate 3398 if not annotation_fields_full: 3399 annotation_infos_option = f"-info {annotation_infos}" 3400 else: 
3401 annotation_infos_option = "" 3402 3403 # Info fields rename 3404 if annotation_infos_rename_list: 3405 annotation_infos_rename = " -c " + ",".join( 3406 annotation_infos_rename_list 3407 ) 3408 else: 3409 annotation_infos_rename = "" 3410 3411 # Annotate command 3412 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3413 3414 # Add command 3415 commands[command_annotate] = tmp_annotation_vcf_name 3416 3417 if commands: 3418 3419 # Export VCF file 3420 self.export_variant_vcf( 3421 vcf_file=tmp_vcf_name, 3422 remove_info=True, 3423 add_samples=False, 3424 index=True, 3425 ) 3426 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3427 3428 # Num command 3429 nb_command = 0 3430 3431 # Annotate 3432 for command_annotate in commands: 3433 nb_command += 1 3434 log.info( 3435 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3436 ) 3437 log.debug(f"command_annotate={command_annotate}") 3438 run_parallel_commands([command_annotate], threads) 3439 3440 # Debug 3441 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3442 3443 # Update variants 3444 log.info( 3445 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3446 ) 3447 self.update_from_vcf(commands[command_annotate]) 3448 3449 def annotation_bcftools(self, threads: int = None) -> None: 3450 """ 3451 This function annotate with bcftools 3452 3453 :param threads: Number of threads to use 3454 :return: the value of the variable "return_value". 
3455 """ 3456 3457 # DEBUG 3458 log.debug("Start annotation with bcftools databases") 3459 3460 # Threads 3461 if not threads: 3462 threads = self.get_threads() 3463 log.debug("Threads: " + str(threads)) 3464 3465 # Config 3466 config = self.get_config() 3467 log.debug("Config: " + str(config)) 3468 3469 # DEBUG 3470 delete_tmp = True 3471 if self.get_config().get("verbosity", "warning") in ["debug"]: 3472 delete_tmp = False 3473 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3474 3475 # Config - BCFTools bin command 3476 bcftools_bin_command = get_bin_command( 3477 bin="bcftools", 3478 tool="bcftools", 3479 bin_type="bin", 3480 config=config, 3481 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3482 ) 3483 if not bcftools_bin_command: 3484 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3485 log.error(msg_err) 3486 raise ValueError(msg_err) 3487 3488 # Config - BCFTools databases folders 3489 databases_folders = set( 3490 self.get_config() 3491 .get("folders", {}) 3492 .get("databases", {}) 3493 .get("annotations", ["."]) 3494 + self.get_config() 3495 .get("folders", {}) 3496 .get("databases", {}) 3497 .get("bcftools", ["."]) 3498 ) 3499 log.debug("Databases annotations: " + str(databases_folders)) 3500 3501 # Param 3502 annotations = ( 3503 self.get_param() 3504 .get("annotation", {}) 3505 .get("bcftools", {}) 3506 .get("annotations", None) 3507 ) 3508 log.debug("Annotations: " + str(annotations)) 3509 3510 # Assembly 3511 assembly = self.get_param().get( 3512 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3513 ) 3514 3515 # Data 3516 table_variants = self.get_table_variants() 3517 3518 # Check if not empty 3519 log.debug("Check if not empty") 3520 sql_query_chromosomes = ( 3521 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3522 ) 3523 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3524 if not sql_query_chromosomes_df["count"][0]: 3525 log.info(f"VCF empty") 
3526 return 3527 3528 # Export in VCF 3529 log.debug("Create initial file to annotate") 3530 tmp_vcf = NamedTemporaryFile( 3531 prefix=self.get_prefix(), 3532 dir=self.get_tmp_dir(), 3533 suffix=".vcf.gz", 3534 delete=False, 3535 ) 3536 tmp_vcf_name = tmp_vcf.name 3537 3538 # VCF header 3539 vcf_reader = self.get_header() 3540 log.debug("Initial header: " + str(vcf_reader.infos)) 3541 3542 # Existing annotations 3543 for vcf_annotation in self.get_header().infos: 3544 3545 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3546 log.debug( 3547 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3548 ) 3549 3550 if annotations: 3551 3552 tmp_ann_vcf_list = [] 3553 commands = [] 3554 tmp_files = [] 3555 err_files = [] 3556 3557 for annotation in annotations: 3558 annotation_fields = annotations[annotation] 3559 3560 # Annotation Name 3561 annotation_name = os.path.basename(annotation) 3562 3563 if not annotation_fields: 3564 annotation_fields = {"INFO": None} 3565 3566 log.debug(f"Annotation '{annotation_name}'") 3567 log.debug( 3568 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3569 ) 3570 3571 # Create Database 3572 database = Database( 3573 database=annotation, 3574 databases_folders=databases_folders, 3575 assembly=assembly, 3576 ) 3577 3578 # Find files 3579 db_file = database.get_database() 3580 db_file = full_path(db_file) 3581 db_hdr_file = database.get_header_file() 3582 db_hdr_file = full_path(db_hdr_file) 3583 db_file_type = database.get_format() 3584 db_tbi_file = f"{db_file}.tbi" 3585 db_file_compressed = database.is_compressed() 3586 3587 # Check if compressed 3588 if not db_file_compressed: 3589 log.error( 3590 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3591 ) 3592 raise ValueError( 3593 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3594 ) 3595 3596 # Check if indexed 3597 if not os.path.exists(db_tbi_file): 3598 log.error(f"Annotation '{annotation}' - {db_file} NOT 
indexed file") 3599 raise ValueError( 3600 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3601 ) 3602 3603 # Check index - try to create if not exists 3604 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3605 log.error("Annotation failed: database not valid") 3606 log.error(f"Annotation annotation file: {db_file}") 3607 log.error(f"Annotation annotation header: {db_hdr_file}") 3608 log.error(f"Annotation annotation index: {db_tbi_file}") 3609 raise ValueError( 3610 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3611 ) 3612 else: 3613 3614 log.debug( 3615 f"Annotation '{annotation}' - file: " 3616 + str(db_file) 3617 + " and " 3618 + str(db_hdr_file) 3619 ) 3620 3621 # Load header as VCF object 3622 db_hdr_vcf = Variants(input=db_hdr_file) 3623 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3624 log.debug( 3625 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3626 ) 3627 3628 # For all fields in database 3629 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3630 annotation_fields = { 3631 key: key for key in db_hdr_vcf_header_infos 3632 } 3633 log.debug( 3634 "Annotation database header - All annotations added: " 3635 + str(annotation_fields) 3636 ) 3637 3638 # Number of fields 3639 nb_annotation_field = 0 3640 annotation_list = [] 3641 3642 for annotation_field in annotation_fields: 3643 3644 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3645 annotation_fields_new_name = annotation_fields.get( 3646 annotation_field, annotation_field 3647 ) 3648 if not annotation_fields_new_name: 3649 annotation_fields_new_name = annotation_field 3650 3651 # Check if field is in DB and if field is not elready in input data 3652 if ( 3653 annotation_field in db_hdr_vcf.get_header().infos 3654 and annotation_fields_new_name 3655 not in self.get_header().infos 3656 ): 3657 3658 log.info( 3659 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3660 ) 3661 3662 # Add INFO field to header 3663 db_hdr_vcf_header_infos_number = ( 3664 db_hdr_vcf_header_infos[annotation_field].num or "." 3665 ) 3666 db_hdr_vcf_header_infos_type = ( 3667 db_hdr_vcf_header_infos[annotation_field].type 3668 or "String" 3669 ) 3670 db_hdr_vcf_header_infos_description = ( 3671 db_hdr_vcf_header_infos[annotation_field].desc 3672 or f"{annotation_field} description" 3673 ) 3674 db_hdr_vcf_header_infos_source = ( 3675 db_hdr_vcf_header_infos[annotation_field].source 3676 or "unknown" 3677 ) 3678 db_hdr_vcf_header_infos_version = ( 3679 db_hdr_vcf_header_infos[annotation_field].version 3680 or "unknown" 3681 ) 3682 3683 vcf_reader.infos[annotation_fields_new_name] = ( 3684 vcf.parser._Info( 3685 annotation_fields_new_name, 3686 db_hdr_vcf_header_infos_number, 3687 db_hdr_vcf_header_infos_type, 3688 db_hdr_vcf_header_infos_description, 3689 db_hdr_vcf_header_infos_source, 3690 db_hdr_vcf_header_infos_version, 3691 self.code_type_map[db_hdr_vcf_header_infos_type], 3692 ) 3693 ) 3694 3695 # annotation_list.append(annotation_field) 3696 if annotation_field != annotation_fields_new_name: 3697 annotation_list.append( 3698 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3699 ) 3700 else: 3701 annotation_list.append(annotation_field) 3702 3703 nb_annotation_field += 1 3704 3705 else: 3706 3707 if annotation_field not in db_hdr_vcf.get_header().infos: 3708 log.warning( 3709 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3710 ) 3711 if annotation_fields_new_name in self.get_header().infos: 3712 log.warning( 3713 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3714 ) 3715 3716 log.info( 3717 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3718 ) 3719 3720 annotation_infos = ",".join(annotation_list) 3721 3722 if annotation_infos != "": 3723 3724 # Protect header for bcftools (remove "#CHROM" and variants line) 3725 log.debug("Protect Header file - remove #CHROM line if exists") 3726 tmp_header_vcf = NamedTemporaryFile( 3727 prefix=self.get_prefix(), 3728 dir=self.get_tmp_dir(), 3729 suffix=".hdr", 3730 delete=False, 3731 ) 3732 tmp_header_vcf_name = tmp_header_vcf.name 3733 tmp_files.append(tmp_header_vcf_name) 3734 # Command 3735 if db_hdr_file.endswith(".gz"): 3736 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3737 else: 3738 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3739 # Run 3740 run_parallel_commands([command_extract_header], 1) 3741 3742 # Find chomosomes 3743 log.debug("Find chromosomes ") 3744 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3745 sql_query_chromosomes_df = self.get_query_to_df( 3746 sql_query_chromosomes 3747 ) 3748 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3749 3750 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3751 3752 # BED columns in the annotation file 3753 if db_file_type in ["bed"]: 3754 annotation_infos = "CHROM,POS,POS," + annotation_infos 3755 3756 for chrom in chomosomes_list: 3757 3758 # Create BED on initial VCF 3759 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3760 tmp_bed = NamedTemporaryFile( 3761 prefix=self.get_prefix(), 3762 
dir=self.get_tmp_dir(), 3763 suffix=".bed", 3764 delete=False, 3765 ) 3766 tmp_bed_name = tmp_bed.name 3767 tmp_files.append(tmp_bed_name) 3768 3769 # Detecte regions 3770 log.debug( 3771 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3772 ) 3773 window = 1000000 3774 sql_query_intervals_for_bed = f""" 3775 SELECT \"#CHROM\", 3776 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3777 \"POS\"+{window} 3778 FROM {table_variants} as table_variants 3779 WHERE table_variants.\"#CHROM\" = '{chrom}' 3780 """ 3781 regions = self.conn.execute( 3782 sql_query_intervals_for_bed 3783 ).fetchall() 3784 merged_regions = merge_regions(regions) 3785 log.debug( 3786 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3787 ) 3788 3789 header = ["#CHROM", "START", "END"] 3790 with open(tmp_bed_name, "w") as f: 3791 # Write the header with tab delimiter 3792 f.write("\t".join(header) + "\n") 3793 for d in merged_regions: 3794 # Write each data row with tab delimiter 3795 f.write("\t".join(map(str, d)) + "\n") 3796 3797 # Tmp files 3798 tmp_annotation_vcf = NamedTemporaryFile( 3799 prefix=self.get_prefix(), 3800 dir=self.get_tmp_dir(), 3801 suffix=".vcf.gz", 3802 delete=False, 3803 ) 3804 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3805 tmp_files.append(tmp_annotation_vcf_name) 3806 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3807 tmp_annotation_vcf_name_err = ( 3808 tmp_annotation_vcf_name + ".err" 3809 ) 3810 err_files.append(tmp_annotation_vcf_name_err) 3811 3812 # Annotate Command 3813 log.debug( 3814 f"Annotation '{annotation}' - add bcftools command" 3815 ) 3816 3817 # Command 3818 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3819 3820 # Add command 3821 commands.append(command_annotate) 3822 3823 # if some commands 3824 if commands: 3825 3826 # Export VCF file 3827 self.export_variant_vcf( 3828 vcf_file=tmp_vcf_name, 3829 remove_info=True, 3830 add_samples=False, 3831 index=True, 3832 ) 3833 3834 # Threads 3835 # calculate threads for annotated commands 3836 if commands: 3837 threads_bcftools_annotate = round(threads / len(commands)) 3838 else: 3839 threads_bcftools_annotate = 1 3840 3841 if not threads_bcftools_annotate: 3842 threads_bcftools_annotate = 1 3843 3844 # Add threads option to bcftools commands 3845 if threads_bcftools_annotate > 1: 3846 commands_threaded = [] 3847 for command in commands: 3848 commands_threaded.append( 3849 command.replace( 3850 f"{bcftools_bin_command} annotate ", 3851 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3852 ) 3853 ) 3854 commands = commands_threaded 3855 3856 # Command annotation multithreading 3857 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3858 log.info( 3859 f"Annotation - Annotation multithreaded in " 3860 + str(len(commands)) 3861 + " commands" 3862 ) 3863 3864 run_parallel_commands(commands, threads) 3865 3866 # Merge 3867 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 3868 3869 if tmp_ann_vcf_list_cmd: 3870 3871 # Tmp file 3872 tmp_annotate_vcf = NamedTemporaryFile( 3873 prefix=self.get_prefix(), 3874 dir=self.get_tmp_dir(), 3875 suffix=".vcf.gz", 3876 delete=True, 3877 ) 3878 tmp_annotate_vcf_name = tmp_annotate_vcf.name 3879 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 3880 err_files.append(tmp_annotate_vcf_name_err) 3881 3882 # Tmp file remove command 3883 tmp_files_remove_command = "" 3884 if tmp_files: 3885 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 3886 3887 # Command merge 3888 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 3889 log.info( 3890 f"Annotation - Annotation merging " 3891 + str(len(commands)) 3892 + " annotated files" 3893 ) 3894 log.debug(f"Annotation - merge command: {merge_command}") 3895 run_parallel_commands([merge_command], 1) 3896 3897 # Error messages 3898 log.info(f"Error/Warning messages:") 3899 error_message_command_all = [] 3900 error_message_command_warning = [] 3901 error_message_command_err = [] 3902 for err_file in err_files: 3903 with open(err_file, "r") as f: 3904 for line in f: 3905 message = line.strip() 3906 error_message_command_all.append(message) 3907 if line.startswith("[W::"): 3908 error_message_command_warning.append(message) 3909 if line.startswith("[E::"): 3910 error_message_command_err.append( 3911 f"{err_file}: " + message 3912 ) 3913 # log info 3914 for message in list( 3915 set(error_message_command_err + error_message_command_warning) 3916 ): 3917 log.info(f" {message}") 3918 # debug info 3919 for message in list(set(error_message_command_all)): 3920 log.debug(f" {message}") 3921 # failed 3922 if len(error_message_command_err): 3923 log.error("Annotation failed: Error in commands") 3924 raise ValueError("Annotation failed: Error in commands") 3925 3926 # Update variants 3927 log.info(f"Annotation - Updating...") 3928 self.update_from_vcf(tmp_annotate_vcf_name) 3929 3930 def annotation_exomiser(self, threads: int = None) -> None: 3931 """ 3932 This function annotate with Exomiser 3933 3934 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 3935 - "analysis" (dict/file): 3936 Full analysis dictionnary parameters (see Exomiser docs). 3937 Either a dict, or a file in JSON or YAML format. 3938 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 3939 Default : None 3940 - "preset" (string): 3941 Analysis preset (available in config folder). 
3942 Used if no full "analysis" is provided. 3943 Default: "exome" 3944 - "phenopacket" (dict/file): 3945 Samples and phenotipic features parameters (see Exomiser docs). 3946 Either a dict, or a file in JSON or YAML format. 3947 Default: None 3948 - "subject" (dict): 3949 Sample parameters (see Exomiser docs). 3950 Example: 3951 "subject": 3952 { 3953 "id": "ISDBM322017", 3954 "sex": "FEMALE" 3955 } 3956 Default: None 3957 - "sample" (string): 3958 Sample name to construct "subject" section: 3959 "subject": 3960 { 3961 "id": "<sample>", 3962 "sex": "UNKNOWN_SEX" 3963 } 3964 Default: None 3965 - "phenotypicFeatures" (dict) 3966 Phenotypic features to construct "subject" section. 3967 Example: 3968 "phenotypicFeatures": 3969 [ 3970 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 3971 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 3972 ] 3973 - "hpo" (list) 3974 List of HPO ids as phenotypic features. 3975 Example: 3976 "hpo": ['0001156', '0001363', '0011304', '0010055'] 3977 Default: [] 3978 - "outputOptions" (dict): 3979 Output options (see Exomiser docs). 3980 Default: 3981 "output_options" = 3982 { 3983 "outputContributingVariantsOnly": False, 3984 "numGenes": 0, 3985 "outputFormats": ["TSV_VARIANT", "VCF"] 3986 } 3987 - "transcript_source" (string): 3988 Transcript source (either "refseq", "ucsc", "ensembl") 3989 Default: "refseq" 3990 - "exomiser_to_info" (boolean): 3991 Add exomiser TSV file columns as INFO fields in VCF. 3992 Default: False 3993 - "release" (string): 3994 Exomise database release. 3995 If not exists, database release will be downloaded (take a while). 3996 Default: None (provided by application.properties configuration file) 3997 - "exomiser_application_properties" (file): 3998 Exomiser configuration file (see Exomiser docs). 3999 Useful to automatically download databases (especially for specific genome databases). 
4000 4001 Notes: 4002 - If no sample in parameters, first sample in VCF will be chosen 4003 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4004 4005 :param threads: The number of threads to use 4006 :return: None. 4007 """ 4008 4009 # DEBUG 4010 log.debug("Start annotation with Exomiser databases") 4011 4012 # Threads 4013 if not threads: 4014 threads = self.get_threads() 4015 log.debug("Threads: " + str(threads)) 4016 4017 # Config 4018 config = self.get_config() 4019 log.debug("Config: " + str(config)) 4020 4021 # Config - Folders - Databases 4022 databases_folders = ( 4023 config.get("folders", {}) 4024 .get("databases", {}) 4025 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4026 ) 4027 databases_folders = full_path(databases_folders) 4028 if not os.path.exists(databases_folders): 4029 log.error(f"Databases annotations: {databases_folders} NOT found") 4030 log.debug("Databases annotations: " + str(databases_folders)) 4031 4032 # Config - Exomiser 4033 exomiser_bin_command = get_bin_command( 4034 bin="exomiser-cli*.jar", 4035 tool="exomiser", 4036 bin_type="jar", 4037 config=config, 4038 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4039 ) 4040 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4041 if not exomiser_bin_command: 4042 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4043 log.error(msg_err) 4044 raise ValueError(msg_err) 4045 4046 # Param 4047 param = self.get_param() 4048 log.debug("Param: " + str(param)) 4049 4050 # Param - Exomiser 4051 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4052 log.debug(f"Param Exomiser: {param_exomiser}") 4053 4054 # Param - Assembly 4055 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4056 log.debug("Assembly: " + str(assembly)) 4057 4058 # Data 4059 table_variants = self.get_table_variants() 4060 4061 # Check if not empty 4062 log.debug("Check if not empty") 4063 sql_query_chromosomes = 
( 4064 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4065 ) 4066 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4067 log.info(f"VCF empty") 4068 return False 4069 4070 # VCF header 4071 vcf_reader = self.get_header() 4072 log.debug("Initial header: " + str(vcf_reader.infos)) 4073 4074 # Samples 4075 samples = self.get_header_sample_list() 4076 if not samples: 4077 log.error("No Samples in VCF") 4078 return False 4079 log.debug(f"Samples: {samples}") 4080 4081 # Memory limit 4082 memory_limit = self.get_memory("8G") 4083 log.debug(f"memory_limit: {memory_limit}") 4084 4085 # Exomiser java options 4086 exomiser_java_options = ( 4087 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4088 ) 4089 log.debug(f"Exomiser java options: {exomiser_java_options}") 4090 4091 # Download Exomiser (if not exists) 4092 exomiser_release = param_exomiser.get("release", None) 4093 exomiser_application_properties = param_exomiser.get( 4094 "exomiser_application_properties", None 4095 ) 4096 databases_download_exomiser( 4097 assemblies=[assembly], 4098 exomiser_folder=databases_folders, 4099 exomiser_release=exomiser_release, 4100 exomiser_phenotype_release=exomiser_release, 4101 exomiser_application_properties=exomiser_application_properties, 4102 ) 4103 4104 # Force annotation 4105 force_update_annotation = True 4106 4107 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4108 log.debug("Start annotation Exomiser") 4109 4110 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4111 4112 # tmp_dir = "/tmp/exomiser" 4113 4114 ### ANALYSIS ### 4115 ################ 4116 4117 # Create analysis.json through analysis dict 4118 # either analysis in param or by default 4119 # depending on preset exome/genome) 4120 4121 # Init analysis dict 4122 param_exomiser_analysis_dict = {} 4123 4124 # analysis from param 4125 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4126 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4127 4128 # If analysis in param -> load anlaysis json 4129 if param_exomiser_analysis: 4130 4131 # If param analysis is a file and exists 4132 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4133 param_exomiser_analysis 4134 ): 4135 # Load analysis file into analysis dict (either yaml or json) 4136 with open(param_exomiser_analysis) as json_file: 4137 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4138 4139 # If param analysis is a dict 4140 elif isinstance(param_exomiser_analysis, dict): 4141 # Load analysis dict into analysis dict (either yaml or json) 4142 param_exomiser_analysis_dict = param_exomiser_analysis 4143 4144 # Error analysis type 4145 else: 4146 log.error(f"Analysis type unknown. Check param file.") 4147 raise ValueError(f"Analysis type unknown. Check param file.") 4148 4149 # Case no input analysis config file/dict 4150 # Use preset (exome/genome) to open default config file 4151 if not param_exomiser_analysis_dict: 4152 4153 # default preset 4154 default_preset = "exome" 4155 4156 # Get param preset or default preset 4157 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4158 4159 # Try to find if preset is a file 4160 if os.path.exists(param_exomiser_preset): 4161 # Preset file is provided in full path 4162 param_exomiser_analysis_default_config_file = ( 4163 param_exomiser_preset 4164 ) 4165 # elif os.path.exists(full_path(param_exomiser_preset)): 4166 # # Preset file is provided in full path 4167 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4168 elif os.path.exists( 4169 os.path.join(folder_config, param_exomiser_preset) 4170 ): 4171 # Preset file is provided a basename in config folder (can be a path with subfolders) 4172 param_exomiser_analysis_default_config_file = os.path.join( 4173 folder_config, param_exomiser_preset 4174 ) 4175 else: 4176 # Construct preset file 4177 
param_exomiser_analysis_default_config_file = os.path.join( 4178 folder_config, 4179 f"preset-{param_exomiser_preset}-analysis.json", 4180 ) 4181 4182 # If preset file exists 4183 param_exomiser_analysis_default_config_file = full_path( 4184 param_exomiser_analysis_default_config_file 4185 ) 4186 if os.path.exists(param_exomiser_analysis_default_config_file): 4187 # Load prest file into analysis dict (either yaml or json) 4188 with open( 4189 param_exomiser_analysis_default_config_file 4190 ) as json_file: 4191 # param_exomiser_analysis_dict[""] = json.load(json_file) 4192 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4193 json_file 4194 ) 4195 4196 # Error preset file 4197 else: 4198 log.error( 4199 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4200 ) 4201 raise ValueError( 4202 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4203 ) 4204 4205 # If no analysis dict created 4206 if not param_exomiser_analysis_dict: 4207 log.error(f"No analysis config") 4208 raise ValueError(f"No analysis config") 4209 4210 # Log 4211 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4212 4213 ### PHENOPACKET ### 4214 ################### 4215 4216 # If no PhenoPacket in analysis dict -> check in param 4217 if "phenopacket" not in param_exomiser_analysis_dict: 4218 4219 # If PhenoPacket in param -> load anlaysis json 4220 if param_exomiser.get("phenopacket", None): 4221 4222 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4223 param_exomiser_phenopacket = full_path( 4224 param_exomiser_phenopacket 4225 ) 4226 4227 # If param phenopacket is a file and exists 4228 if isinstance( 4229 param_exomiser_phenopacket, str 4230 ) and os.path.exists(param_exomiser_phenopacket): 4231 # Load phenopacket file into analysis dict (either yaml or json) 4232 with open(param_exomiser_phenopacket) as json_file: 4233 param_exomiser_analysis_dict["phenopacket"] = ( 4234 yaml.safe_load(json_file) 
4235 ) 4236 4237 # If param phenopacket is a dict 4238 elif isinstance(param_exomiser_phenopacket, dict): 4239 # Load phenopacket dict into analysis dict (either yaml or json) 4240 param_exomiser_analysis_dict["phenopacket"] = ( 4241 param_exomiser_phenopacket 4242 ) 4243 4244 # Error phenopacket type 4245 else: 4246 log.error(f"Phenopacket type unknown. Check param file.") 4247 raise ValueError( 4248 f"Phenopacket type unknown. Check param file." 4249 ) 4250 4251 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4252 if "phenopacket" not in param_exomiser_analysis_dict: 4253 4254 # Init PhenoPacket 4255 param_exomiser_analysis_dict["phenopacket"] = { 4256 "id": "analysis", 4257 "proband": {}, 4258 } 4259 4260 ### Add subject ### 4261 4262 # If subject exists 4263 param_exomiser_subject = param_exomiser.get("subject", {}) 4264 4265 # If subject not exists -> found sample ID 4266 if not param_exomiser_subject: 4267 4268 # Found sample ID in param 4269 sample = param_exomiser.get("sample", None) 4270 4271 # Find sample ID (first sample) 4272 if not sample: 4273 sample_list = self.get_header_sample_list() 4274 if len(sample_list) > 0: 4275 sample = sample_list[0] 4276 else: 4277 log.error(f"No sample found") 4278 raise ValueError(f"No sample found") 4279 4280 # Create subject 4281 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4282 4283 # Add to dict 4284 param_exomiser_analysis_dict["phenopacket"][ 4285 "subject" 4286 ] = param_exomiser_subject 4287 4288 ### Add "phenotypicFeatures" ### 4289 4290 # If phenotypicFeatures exists 4291 param_exomiser_phenotypicfeatures = param_exomiser.get( 4292 "phenotypicFeatures", [] 4293 ) 4294 4295 # If phenotypicFeatures not exists -> Try to infer from hpo list 4296 if not param_exomiser_phenotypicfeatures: 4297 4298 # Found HPO in param 4299 param_exomiser_hpo = param_exomiser.get("hpo", []) 4300 4301 # Split HPO if list in string format separated by comma 4302 if 
isinstance(param_exomiser_hpo, str): 4303 param_exomiser_hpo = param_exomiser_hpo.split(",") 4304 4305 # Create HPO list 4306 for hpo in param_exomiser_hpo: 4307 hpo_clean = re.sub("[^0-9]", "", hpo) 4308 param_exomiser_phenotypicfeatures.append( 4309 { 4310 "type": { 4311 "id": f"HP:{hpo_clean}", 4312 "label": f"HP:{hpo_clean}", 4313 } 4314 } 4315 ) 4316 4317 # Add to dict 4318 param_exomiser_analysis_dict["phenopacket"][ 4319 "phenotypicFeatures" 4320 ] = param_exomiser_phenotypicfeatures 4321 4322 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4323 if not param_exomiser_phenotypicfeatures: 4324 for step in param_exomiser_analysis_dict.get( 4325 "analysis", {} 4326 ).get("steps", []): 4327 if "hiPhivePrioritiser" in step: 4328 param_exomiser_analysis_dict.get("analysis", {}).get( 4329 "steps", [] 4330 ).remove(step) 4331 4332 ### Add Input File ### 4333 4334 # Initial file name and htsFiles 4335 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4336 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4337 { 4338 "uri": tmp_vcf_name, 4339 "htsFormat": "VCF", 4340 "genomeAssembly": assembly, 4341 } 4342 ] 4343 4344 ### Add metaData ### 4345 4346 # If metaData not in analysis dict 4347 if "metaData" not in param_exomiser_analysis_dict: 4348 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4349 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4350 "createdBy": "howard", 4351 "phenopacketSchemaVersion": 1, 4352 } 4353 4354 ### OutputOptions ### 4355 4356 # Init output result folder 4357 output_results = os.path.join(tmp_dir, "results") 4358 4359 # If no outputOptions in analysis dict 4360 if "outputOptions" not in param_exomiser_analysis_dict: 4361 4362 # default output formats 4363 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4364 4365 # Get outputOptions in param 4366 output_options = param_exomiser.get("outputOptions", None) 4367 4368 # If no output_options in param -> check 4369 if not output_options: 
4370 output_options = { 4371 "outputContributingVariantsOnly": False, 4372 "numGenes": 0, 4373 "outputFormats": defaut_output_formats, 4374 } 4375 4376 # Replace outputDirectory in output options 4377 output_options["outputDirectory"] = output_results 4378 output_options["outputFileName"] = "howard" 4379 4380 # Add outputOptions in analysis dict 4381 param_exomiser_analysis_dict["outputOptions"] = output_options 4382 4383 else: 4384 4385 # Replace output_results and output format (if exists in param) 4386 param_exomiser_analysis_dict["outputOptions"][ 4387 "outputDirectory" 4388 ] = output_results 4389 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4390 list( 4391 set( 4392 param_exomiser_analysis_dict.get( 4393 "outputOptions", {} 4394 ).get("outputFormats", []) 4395 + ["TSV_VARIANT", "VCF"] 4396 ) 4397 ) 4398 ) 4399 4400 # log 4401 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4402 4403 ### ANALYSIS FILE ### 4404 ##################### 4405 4406 ### Full JSON analysis config file ### 4407 4408 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4409 with open(exomiser_analysis, "w") as fp: 4410 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4411 4412 ### SPLIT analysis and sample config files 4413 4414 # Splitted analysis dict 4415 param_exomiser_analysis_dict_for_split = ( 4416 param_exomiser_analysis_dict.copy() 4417 ) 4418 4419 # Phenopacket JSON file 4420 exomiser_analysis_phenopacket = os.path.join( 4421 tmp_dir, "analysis_phenopacket.json" 4422 ) 4423 with open(exomiser_analysis_phenopacket, "w") as fp: 4424 json.dump( 4425 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4426 fp, 4427 indent=4, 4428 ) 4429 4430 # Analysis JSON file without Phenopacket parameters 4431 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4432 exomiser_analysis_analysis = os.path.join( 4433 tmp_dir, "analysis_analysis.json" 4434 ) 4435 with open(exomiser_analysis_analysis, "w") as fp: 4436 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4437 4438 ### INITAL VCF file ### 4439 ####################### 4440 4441 ### Create list of samples to use and include inti initial VCF file #### 4442 4443 # Subject (main sample) 4444 # Get sample ID in analysis dict 4445 sample_subject = ( 4446 param_exomiser_analysis_dict.get("phenopacket", {}) 4447 .get("subject", {}) 4448 .get("id", None) 4449 ) 4450 sample_proband = ( 4451 param_exomiser_analysis_dict.get("phenopacket", {}) 4452 .get("proband", {}) 4453 .get("subject", {}) 4454 .get("id", None) 4455 ) 4456 sample = [] 4457 if sample_subject: 4458 sample.append(sample_subject) 4459 if sample_proband: 4460 sample.append(sample_proband) 4461 4462 # Get sample ID within Pedigree 4463 pedigree_persons_list = ( 4464 param_exomiser_analysis_dict.get("phenopacket", {}) 4465 .get("pedigree", {}) 4466 .get("persons", {}) 4467 ) 4468 4469 # Create list with all sample ID in pedigree (if exists) 4470 pedigree_persons = [] 4471 for person in pedigree_persons_list: 4472 pedigree_persons.append(person.get("individualId")) 4473 4474 # Concat subject sample ID and samples ID in pedigreesamples 4475 samples = list(set(sample + pedigree_persons)) 4476 4477 # Check if sample list is not empty 4478 if not samples: 4479 log.error(f"No samples found") 4480 raise ValueError(f"No samples found") 4481 4482 # Create VCF with sample (either sample in param or first one by default) 4483 # Export VCF file 4484 self.export_variant_vcf( 4485 vcf_file=tmp_vcf_name, 4486 remove_info=True, 4487 add_samples=True, 4488 list_samples=samples, 4489 index=False, 4490 ) 4491 4492 ### Execute Exomiser ### 4493 ######################## 4494 4495 # Init command 4496 exomiser_command = "" 4497 4498 # Command exomiser options 4499 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4500 4501 # Release 4502 exomiser_release = 
param_exomiser.get("release", None) 4503 if exomiser_release: 4504 # phenotype data version 4505 exomiser_options += ( 4506 f" --exomiser.phenotype.data-version={exomiser_release} " 4507 ) 4508 # data version 4509 exomiser_options += ( 4510 f" --exomiser.{assembly}.data-version={exomiser_release} " 4511 ) 4512 # variant white list 4513 variant_white_list_file = ( 4514 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4515 ) 4516 if os.path.exists( 4517 os.path.join( 4518 databases_folders, assembly, variant_white_list_file 4519 ) 4520 ): 4521 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4522 4523 # transcript_source 4524 transcript_source = param_exomiser.get( 4525 "transcript_source", None 4526 ) # ucsc, refseq, ensembl 4527 if transcript_source: 4528 exomiser_options += ( 4529 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4530 ) 4531 4532 # If analysis contain proband param 4533 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4534 "proband", {} 4535 ): 4536 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4537 4538 # If no proband (usually uniq sample) 4539 else: 4540 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4541 4542 # Log 4543 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4544 4545 # Run command 4546 result = subprocess.call( 4547 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4548 ) 4549 if result: 4550 log.error("Exomiser command failed") 4551 raise ValueError("Exomiser command failed") 4552 4553 ### RESULTS ### 4554 ############### 4555 4556 ### Annotate with TSV fields ### 4557 4558 # Init result tsv file 4559 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4560 4561 # Init result tsv file 4562 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 4563 4564 # Parse TSV file and explode columns in INFO field 4565 if exomiser_to_info and os.path.exists(output_results_tsv): 4566 4567 # Log 4568 log.debug("Exomiser columns to VCF INFO field") 4569 4570 # Retrieve columns and types 4571 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4572 output_results_tsv_df = self.get_query_to_df(query) 4573 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4574 4575 # Init concat fields for update 4576 sql_query_update_concat_fields = [] 4577 4578 # Fields to avoid 4579 fields_to_avoid = [ 4580 "CONTIG", 4581 "START", 4582 "END", 4583 "REF", 4584 "ALT", 4585 "QUAL", 4586 "FILTER", 4587 "GENOTYPE", 4588 ] 4589 4590 # List all columns to add into header 4591 for header_column in output_results_tsv_columns: 4592 4593 # If header column is enable 4594 if header_column not in fields_to_avoid: 4595 4596 # Header info type 4597 header_info_type = "String" 4598 header_column_df = output_results_tsv_df[header_column] 4599 header_column_df_dtype = header_column_df.dtype 4600 if header_column_df_dtype == object: 4601 if ( 4602 pd.to_numeric(header_column_df, errors="coerce") 4603 .notnull() 4604 .all() 4605 ): 4606 header_info_type = "Float" 4607 else: 4608 header_info_type = "Integer" 4609 4610 # Header info 4611 characters_to_validate = ["-"] 4612 pattern = "[" + "".join(characters_to_validate) + "]" 4613 header_info_name = re.sub( 4614 pattern, 4615 "_", 4616 f"Exomiser_{header_column}".replace("#", ""), 4617 ) 4618 header_info_number = "." 
4619 header_info_description = ( 4620 f"Exomiser {header_column} annotation" 4621 ) 4622 header_info_source = "Exomiser" 4623 header_info_version = "unknown" 4624 header_info_code = CODE_TYPE_MAP[header_info_type] 4625 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4626 header_info_name, 4627 header_info_number, 4628 header_info_type, 4629 header_info_description, 4630 header_info_source, 4631 header_info_version, 4632 header_info_code, 4633 ) 4634 4635 # Add field to add for update to concat fields 4636 sql_query_update_concat_fields.append( 4637 f""" 4638 CASE 4639 WHEN table_parquet."{header_column}" NOT IN ('','.') 4640 THEN concat( 4641 '{header_info_name}=', 4642 table_parquet."{header_column}", 4643 ';' 4644 ) 4645 4646 ELSE '' 4647 END 4648 """ 4649 ) 4650 4651 # Update query 4652 sql_query_update = f""" 4653 UPDATE {table_variants} as table_variants 4654 SET INFO = concat( 4655 CASE 4656 WHEN INFO NOT IN ('', '.') 4657 THEN INFO 4658 ELSE '' 4659 END, 4660 CASE 4661 WHEN table_variants.INFO NOT IN ('','.') 4662 THEN ';' 4663 ELSE '' 4664 END, 4665 ( 4666 SELECT 4667 concat( 4668 {",".join(sql_query_update_concat_fields)} 4669 ) 4670 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4671 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4672 AND table_parquet.\"START\" = table_variants.\"POS\" 4673 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4674 AND table_parquet.\"REF\" = table_variants.\"REF\" 4675 ) 4676 ) 4677 ; 4678 """ 4679 4680 # Update 4681 self.conn.execute(sql_query_update) 4682 4683 ### Annotate with VCF INFO field ### 4684 4685 # Init result VCF file 4686 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4687 4688 # If VCF exists 4689 if os.path.exists(output_results_vcf): 4690 4691 # Log 4692 log.debug("Exomiser result VCF update variants") 4693 4694 # Find Exomiser INFO field annotation in header 4695 with 
gzip.open(output_results_vcf, "rt") as f: 4696 header_list = self.read_vcf_header(f) 4697 exomiser_vcf_header = vcf.Reader( 4698 io.StringIO("\n".join(header_list)) 4699 ) 4700 4701 # Add annotation INFO field to header 4702 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 4703 4704 # Update variants with VCF 4705 self.update_from_vcf(output_results_vcf) 4706 4707 return True 4708 4709 def annotation_snpeff(self, threads: int = None) -> None: 4710 """ 4711 This function annotate with snpEff 4712 4713 :param threads: The number of threads to use 4714 :return: the value of the variable "return_value". 4715 """ 4716 4717 # DEBUG 4718 log.debug("Start annotation with snpeff databases") 4719 4720 # Threads 4721 if not threads: 4722 threads = self.get_threads() 4723 log.debug("Threads: " + str(threads)) 4724 4725 # DEBUG 4726 delete_tmp = True 4727 if self.get_config().get("verbosity", "warning") in ["debug"]: 4728 delete_tmp = False 4729 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4730 4731 # Config 4732 config = self.get_config() 4733 log.debug("Config: " + str(config)) 4734 4735 # Config - Folders - Databases 4736 databases_folders = ( 4737 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4738 ) 4739 log.debug("Databases annotations: " + str(databases_folders)) 4740 4741 # # Config - Java 4742 # java_bin = get_bin( 4743 # tool="java", 4744 # bin="java", 4745 # bin_type="bin", 4746 # config=config, 4747 # default_folder="/usr/bin", 4748 # ) 4749 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4750 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4751 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4752 4753 # # Config - snpEff bin 4754 # snpeff_jar = get_bin( 4755 # tool="snpeff", 4756 # bin="snpEff.jar", 4757 # bin_type="jar", 4758 # config=config, 4759 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4760 # ) 4761 # if not (os.path.exists(snpeff_jar) or 
(snpeff_jar and which(snpeff_jar))): 4762 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4763 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4764 4765 # Config - snpEff bin command 4766 snpeff_bin_command = get_bin_command( 4767 bin="snpEff.jar", 4768 tool="snpeff", 4769 bin_type="jar", 4770 config=config, 4771 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4772 ) 4773 if not snpeff_bin_command: 4774 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4775 log.error(msg_err) 4776 raise ValueError(msg_err) 4777 4778 # Config - snpEff databases 4779 snpeff_databases = ( 4780 config.get("folders", {}) 4781 .get("databases", {}) 4782 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4783 ) 4784 snpeff_databases = full_path(snpeff_databases) 4785 if snpeff_databases is not None and snpeff_databases != "": 4786 log.debug(f"Create snpEff databases folder") 4787 if not os.path.exists(snpeff_databases): 4788 os.makedirs(snpeff_databases) 4789 4790 # Param 4791 param = self.get_param() 4792 log.debug("Param: " + str(param)) 4793 4794 # Param 4795 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4796 log.debug("Options: " + str(options)) 4797 4798 # Param - Assembly 4799 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4800 4801 # Param - Options 4802 snpeff_options = ( 4803 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4804 ) 4805 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4806 snpeff_csvstats = ( 4807 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4808 ) 4809 if snpeff_stats: 4810 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4811 snpeff_stats = full_path(snpeff_stats) 4812 snpeff_options += f" -stats {snpeff_stats}" 4813 if snpeff_csvstats: 4814 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4815 snpeff_csvstats = full_path(snpeff_csvstats) 4816 
snpeff_options += f" -csvStats {snpeff_csvstats}" 4817 4818 # Data 4819 table_variants = self.get_table_variants() 4820 4821 # Check if not empty 4822 log.debug("Check if not empty") 4823 sql_query_chromosomes = ( 4824 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4825 ) 4826 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4827 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4828 log.info(f"VCF empty") 4829 return 4830 4831 # Export in VCF 4832 log.debug("Create initial file to annotate") 4833 tmp_vcf = NamedTemporaryFile( 4834 prefix=self.get_prefix(), 4835 dir=self.get_tmp_dir(), 4836 suffix=".vcf.gz", 4837 delete=True, 4838 ) 4839 tmp_vcf_name = tmp_vcf.name 4840 4841 # VCF header 4842 vcf_reader = self.get_header() 4843 log.debug("Initial header: " + str(vcf_reader.infos)) 4844 4845 # Existing annotations 4846 for vcf_annotation in self.get_header().infos: 4847 4848 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4849 log.debug( 4850 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4851 ) 4852 4853 # Memory limit 4854 # if config.get("memory", None): 4855 # memory_limit = config.get("memory", "8G") 4856 # else: 4857 # memory_limit = "8G" 4858 memory_limit = self.get_memory("8G") 4859 log.debug(f"memory_limit: {memory_limit}") 4860 4861 # snpEff java options 4862 snpeff_java_options = ( 4863 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4864 ) 4865 log.debug(f"Exomiser java options: {snpeff_java_options}") 4866 4867 force_update_annotation = True 4868 4869 if "ANN" not in self.get_header().infos or force_update_annotation: 4870 4871 # Check snpEff database 4872 log.debug(f"Check snpEff databases {[assembly]}") 4873 databases_download_snpeff( 4874 folder=snpeff_databases, assemblies=[assembly], config=config 4875 ) 4876 4877 # Export VCF file 4878 self.export_variant_vcf( 4879 vcf_file=tmp_vcf_name, 4880 remove_info=True, 
4881 add_samples=False, 4882 index=True, 4883 ) 4884 4885 # Tmp file 4886 err_files = [] 4887 tmp_annotate_vcf = NamedTemporaryFile( 4888 prefix=self.get_prefix(), 4889 dir=self.get_tmp_dir(), 4890 suffix=".vcf", 4891 delete=False, 4892 ) 4893 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4894 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4895 err_files.append(tmp_annotate_vcf_name_err) 4896 4897 # Command 4898 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 4899 log.debug(f"Annotation - snpEff command: {snpeff_command}") 4900 run_parallel_commands([snpeff_command], 1) 4901 4902 # Error messages 4903 log.info(f"Error/Warning messages:") 4904 error_message_command_all = [] 4905 error_message_command_warning = [] 4906 error_message_command_err = [] 4907 for err_file in err_files: 4908 with open(err_file, "r") as f: 4909 for line in f: 4910 message = line.strip() 4911 error_message_command_all.append(message) 4912 if line.startswith("[W::"): 4913 error_message_command_warning.append(message) 4914 if line.startswith("[E::"): 4915 error_message_command_err.append(f"{err_file}: " + message) 4916 # log info 4917 for message in list( 4918 set(error_message_command_err + error_message_command_warning) 4919 ): 4920 log.info(f" {message}") 4921 # debug info 4922 for message in list(set(error_message_command_all)): 4923 log.debug(f" {message}") 4924 # failed 4925 if len(error_message_command_err): 4926 log.error("Annotation failed: Error in commands") 4927 raise ValueError("Annotation failed: Error in commands") 4928 4929 # Find annotation in header 4930 with open(tmp_annotate_vcf_name, "rt") as f: 4931 header_list = self.read_vcf_header(f) 4932 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 4933 4934 for ann in annovar_vcf_header.infos: 4935 if ann not in self.get_header().infos: 4936 vcf_reader.infos[ann] = 
annovar_vcf_header.infos.get(ann) 4937 4938 # Update variants 4939 log.info(f"Annotation - Updating...") 4940 self.update_from_vcf(tmp_annotate_vcf_name) 4941 4942 else: 4943 if "ANN" in self.get_header().infos: 4944 log.debug(f"Existing snpEff annotations in VCF") 4945 if force_update_annotation: 4946 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 4947 4948 def annotation_annovar(self, threads: int = None) -> None: 4949 """ 4950 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 4951 annotations 4952 4953 :param threads: number of threads to use 4954 :return: the value of the variable "return_value". 4955 """ 4956 4957 # DEBUG 4958 log.debug("Start annotation with Annovar databases") 4959 4960 # Threads 4961 if not threads: 4962 threads = self.get_threads() 4963 log.debug("Threads: " + str(threads)) 4964 4965 # Tmp en Err files 4966 tmp_files = [] 4967 err_files = [] 4968 4969 # DEBUG 4970 delete_tmp = True 4971 if self.get_config().get("verbosity", "warning") in ["debug"]: 4972 delete_tmp = False 4973 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4974 4975 # Config 4976 config = self.get_config() 4977 log.debug("Config: " + str(config)) 4978 4979 # Config - Folders - Databases 4980 databases_folders = ( 4981 config.get("folders", {}).get("databases", {}).get("annovar", ["."]) 4982 ) 4983 log.debug("Databases annotations: " + str(databases_folders)) 4984 4985 # Config - annovar bin command 4986 annovar_bin_command = get_bin_command( 4987 bin="table_annovar.pl", 4988 tool="annovar", 4989 bin_type="perl", 4990 config=config, 4991 default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar", 4992 ) 4993 if not annovar_bin_command: 4994 msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'" 4995 log.error(msg_err) 4996 raise ValueError(msg_err) 4997 4998 # Config - BCFTools bin command 4999 bcftools_bin_command = get_bin_command( 5000 bin="bcftools", 5001 tool="bcftools", 5002 
bin_type="bin", 5003 config=config, 5004 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 5005 ) 5006 if not bcftools_bin_command: 5007 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 5008 log.error(msg_err) 5009 raise ValueError(msg_err) 5010 5011 # Config - annovar databases 5012 annovar_databases = ( 5013 config.get("folders", {}) 5014 .get("databases", {}) 5015 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 5016 ) 5017 annovar_databases = full_path(annovar_databases) 5018 if annovar_databases != "" and not os.path.exists(annovar_databases): 5019 os.makedirs(annovar_databases) 5020 5021 # Param 5022 param = self.get_param() 5023 log.debug("Param: " + str(param)) 5024 5025 # Param - options 5026 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 5027 log.debug("Options: " + str(options)) 5028 5029 # Param - annotations 5030 annotations = ( 5031 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 5032 ) 5033 log.debug("Annotations: " + str(annotations)) 5034 5035 # Param - Assembly 5036 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5037 5038 # Annovar database assembly 5039 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 5040 if annovar_databases_assembly != "" and not os.path.exists( 5041 annovar_databases_assembly 5042 ): 5043 os.makedirs(annovar_databases_assembly) 5044 5045 # Data 5046 table_variants = self.get_table_variants() 5047 5048 # Check if not empty 5049 log.debug("Check if not empty") 5050 sql_query_chromosomes = ( 5051 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5052 ) 5053 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 5054 if not sql_query_chromosomes_df["count"][0]: 5055 log.info(f"VCF empty") 5056 return 5057 5058 # VCF header 5059 vcf_reader = self.get_header() 5060 log.debug("Initial header: " + str(vcf_reader.infos)) 5061 5062 # Existing annotations 5063 for vcf_annotation in 
self.get_header().infos: 5064 5065 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5066 log.debug( 5067 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5068 ) 5069 5070 force_update_annotation = True 5071 5072 if annotations: 5073 5074 commands = [] 5075 tmp_annotates_vcf_name_list = [] 5076 5077 # Export in VCF 5078 log.debug("Create initial file to annotate") 5079 tmp_vcf = NamedTemporaryFile( 5080 prefix=self.get_prefix(), 5081 dir=self.get_tmp_dir(), 5082 suffix=".vcf.gz", 5083 delete=False, 5084 ) 5085 tmp_vcf_name = tmp_vcf.name 5086 tmp_files.append(tmp_vcf_name) 5087 tmp_files.append(tmp_vcf_name + ".tbi") 5088 5089 # Export VCF file 5090 self.export_variant_vcf( 5091 vcf_file=tmp_vcf_name, 5092 remove_info=".", 5093 add_samples=False, 5094 index=True, 5095 ) 5096 5097 # Create file for field rename 5098 log.debug("Create file for field rename") 5099 tmp_rename = NamedTemporaryFile( 5100 prefix=self.get_prefix(), 5101 dir=self.get_tmp_dir(), 5102 suffix=".rename", 5103 delete=False, 5104 ) 5105 tmp_rename_name = tmp_rename.name 5106 tmp_files.append(tmp_rename_name) 5107 5108 # Check Annovar database 5109 log.debug( 5110 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5111 ) 5112 databases_download_annovar( 5113 folder=annovar_databases, 5114 files=list(annotations.keys()), 5115 assemblies=[assembly], 5116 ) 5117 5118 for annotation in annotations: 5119 annotation_fields = annotations[annotation] 5120 5121 if not annotation_fields: 5122 annotation_fields = {"INFO": None} 5123 5124 log.info(f"Annotations Annovar - database '{annotation}'") 5125 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5126 5127 # Tmp file for annovar 5128 err_files = [] 5129 tmp_annotate_vcf_directory = TemporaryDirectory( 5130 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5131 ) 5132 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5133 
tmp_annotate_vcf_name_annovar = ( 5134 tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf" 5135 ) 5136 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5137 err_files.append(tmp_annotate_vcf_name_err) 5138 tmp_files.append(tmp_annotate_vcf_name_err) 5139 5140 # Tmp file final vcf annotated by annovar 5141 tmp_annotate_vcf = NamedTemporaryFile( 5142 prefix=self.get_prefix(), 5143 dir=self.get_tmp_dir(), 5144 suffix=".vcf.gz", 5145 delete=False, 5146 ) 5147 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5148 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5149 tmp_files.append(tmp_annotate_vcf_name) 5150 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5151 5152 # Number of fields 5153 annotation_list = [] 5154 annotation_renamed_list = [] 5155 5156 for annotation_field in annotation_fields: 5157 5158 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 5159 annotation_fields_new_name = annotation_fields.get( 5160 annotation_field, annotation_field 5161 ) 5162 if not annotation_fields_new_name: 5163 annotation_fields_new_name = annotation_field 5164 5165 if ( 5166 force_update_annotation 5167 or annotation_fields_new_name not in self.get_header().infos 5168 ): 5169 annotation_list.append(annotation_field) 5170 annotation_renamed_list.append(annotation_fields_new_name) 5171 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5172 log.warning( 5173 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5174 ) 5175 5176 # Add rename info 5177 run_parallel_commands( 5178 [ 5179 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5180 ], 5181 1, 5182 ) 5183 5184 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5185 log.debug("annotation_list: " + str(annotation_list)) 5186 5187 # protocol 5188 protocol = annotation 5189 5190 # argument 5191 argument = "" 5192 5193 # operation 5194 operation = "f" 
5195 if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith( 5196 "ensGene" 5197 ): 5198 operation = "g" 5199 if options.get("genebase", None): 5200 argument = f"""'{options.get("genebase","")}'""" 5201 elif annotation in ["cytoBand"]: 5202 operation = "r" 5203 5204 # argument option 5205 argument_option = "" 5206 if argument != "": 5207 argument_option = " --argument " + argument 5208 5209 # command options 5210 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5211 for option in options: 5212 if option not in ["genebase"]: 5213 command_options += f""" --{option}={options[option]}""" 5214 5215 # Command 5216 5217 # Command - Annovar 5218 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5219 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5220 5221 # Command - start pipe 5222 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5223 5224 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
5225 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5226 5227 # Command - Special characters (refGene annotation) 5228 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5229 5230 # Command - Clean empty fields (with value ".") 5231 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5232 5233 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5234 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5235 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5236 # for ann in annotation_renamed_list: 5237 for ann in annotation_list: 5238 annovar_fields_to_keep.append(f"^INFO/{ann}") 5239 5240 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5241 5242 # Command - indexing 5243 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5244 5245 log.debug(f"Annotation - Annovar command: {command_annovar}") 5246 run_parallel_commands([command_annovar], 1) 5247 5248 # Error messages 5249 log.info(f"Error/Warning messages:") 5250 error_message_command_all = [] 5251 error_message_command_warning = [] 5252 error_message_command_err = [] 5253 for err_file in err_files: 5254 with open(err_file, "r") as f: 5255 for line in f: 5256 message = line.strip() 5257 error_message_command_all.append(message) 5258 if line.startswith("[W::") or line.startswith("WARNING"): 5259 error_message_command_warning.append(message) 5260 if line.startswith("[E::") or line.startswith("ERROR"): 5261 
error_message_command_err.append( 5262 f"{err_file}: " + message 5263 ) 5264 # log info 5265 for message in list( 5266 set(error_message_command_err + error_message_command_warning) 5267 ): 5268 log.info(f" {message}") 5269 # debug info 5270 for message in list(set(error_message_command_all)): 5271 log.debug(f" {message}") 5272 # failed 5273 if len(error_message_command_err): 5274 log.error("Annotation failed: Error in commands") 5275 raise ValueError("Annotation failed: Error in commands") 5276 5277 if tmp_annotates_vcf_name_list: 5278 5279 # List of annotated files 5280 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5281 5282 # Tmp file 5283 tmp_annotate_vcf = NamedTemporaryFile( 5284 prefix=self.get_prefix(), 5285 dir=self.get_tmp_dir(), 5286 suffix=".vcf.gz", 5287 delete=False, 5288 ) 5289 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5290 tmp_files.append(tmp_annotate_vcf_name) 5291 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5292 err_files.append(tmp_annotate_vcf_name_err) 5293 tmp_files.append(tmp_annotate_vcf_name_err) 5294 5295 # Command merge 5296 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5297 log.info( 5298 f"Annotation Annovar - Annotation merging " 5299 + str(len(tmp_annotates_vcf_name_list)) 5300 + " annotated files" 5301 ) 5302 log.debug(f"Annotation - merge command: {merge_command}") 5303 run_parallel_commands([merge_command], 1) 5304 5305 # Find annotation in header 5306 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5307 header_list = self.read_vcf_header(f) 5308 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5309 5310 for ann in annovar_vcf_header.infos: 5311 if ann not in self.get_header().infos: 5312 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5313 5314 # Update variants 5315 log.info(f"Annotation Annovar - 
    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with one or several parquet databases.

        For each database listed in param 'annotation.parquet.annotations'
        (or every scanned database when 'ALL' is requested), the selected
        INFO fields are appended to the variants' INFO column through
        per-chromosome SQL UPDATE queries, and the VCF header is extended
        with the corresponding INFO definitions.

        :param threads: number of threads to use for the annotation
            (defaults to the instance's configured thread count)
        :return: None (the variants table and header are updated in place)
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Folders to search databases in: union of 'annotations' and 'parquet' folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        # 'annotations_update' re-annotates fields already present in the header;
        # 'annotations_append' only fills fields that are empty ('' or '.')
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # 'ALL' pseudo-annotation: scan available databases and add each
            # of them (with all INFO fields) to the annotation list
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields (None means: take the whole INFO column)
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion (attach the database to the duckdb connection)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos (database columns not declared in its header)
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Register the extra column with a generic String definition
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (regions databases only)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (requested field -> actual database column)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO before re-annotating it
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                        concat(table_variants.INFO,''),
                                        ';*{annotation_fields_new_name}=[^;]*',
                                        ''
                                    )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO (no ';' before the first field)
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (fall back to generic values
                            # when the database header leaves an attribute empty)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append mode: only annotate when the field is empty in INFO
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                END
                                """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
                                        ELSE ''
                                END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Shortcut: copy the whole database INFO column instead of
                        # one CASE expression per field
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Start with the field-removal queries so they run first
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database
                            # (join on position overlap with [START+1, END], aggregated by POS)
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database
                            # (exact match on CHROM/POS/REF/ALT)
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append the new annotation to INFO,
                            # inserting a ';' separator only when INFO is non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO =
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.')
                                                THEN ';'
                                                ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # (the generated concat() can nest deeply with many fields)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
{sql_query_annotation_update_info_sets_sql} 5860 ) 5861 {sql_query_annotation_from_clause} 5862 WHERE {sql_query_annotation_where_clause} 5863 ; 5864 """ 5865 5866 # Add update query to dict 5867 query_dict[ 5868 f"{chrom} [{nb_of_variant_by_chrom} variants]" 5869 ] = sql_query_annotation_chrom_interval_pos 5870 5871 nb_of_query = len(query_dict) 5872 num_query = 0 5873 5874 # SET max_expression_depth TO x 5875 self.conn.execute("SET max_expression_depth TO 10000") 5876 5877 for query_name in query_dict: 5878 query = query_dict[query_name] 5879 num_query += 1 5880 log.info( 5881 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 5882 ) 5883 result = self.conn.execute(query) 5884 nb_of_variant_annotated_by_query = result.df()["Count"][0] 5885 nb_of_variant_annotated += nb_of_variant_annotated_by_query 5886 log.info( 5887 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 5888 ) 5889 5890 log.info( 5891 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 5892 ) 5893 5894 else: 5895 5896 log.info( 5897 f"Annotation '{annotation_name}' - No Annotations available" 5898 ) 5899 5900 log.debug("Final header: " + str(vcf_reader.infos)) 5901 5902 # Remove added columns 5903 for added_column in added_columns: 5904 self.drop_column(column=added_column) 5905 5906 def annotation_splice(self, threads: int = None) -> None: 5907 """ 5908 This function annotate with snpEff 5909 5910 :param threads: The number of threads to use 5911 :return: the value of the variable "return_value". 
5912 """ 5913 5914 # DEBUG 5915 log.debug("Start annotation with splice tools") 5916 5917 # Threads 5918 if not threads: 5919 threads = self.get_threads() 5920 log.debug("Threads: " + str(threads)) 5921 5922 # DEBUG 5923 delete_tmp = True 5924 if self.get_config().get("verbosity", "warning") in ["debug"]: 5925 delete_tmp = False 5926 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5927 5928 # Config 5929 config = self.get_config() 5930 log.debug("Config: " + str(config)) 5931 splice_config = config.get("tools", {}).get("splice", {}) 5932 if not splice_config: 5933 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5934 if not splice_config: 5935 msg_err = "No Splice tool config" 5936 log.error(msg_err) 5937 raise ValueError(msg_err) 5938 log.debug(f"splice_config={splice_config}") 5939 5940 # Config - Folders - Databases 5941 databases_folders = ( 5942 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5943 ) 5944 log.debug("Databases annotations: " + str(databases_folders)) 5945 5946 # Splice docker image 5947 splice_docker_image = splice_config.get("docker").get("image") 5948 5949 # Pull splice image if it's not already there 5950 if not check_docker_image_exists(splice_docker_image): 5951 log.warning( 5952 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5953 ) 5954 try: 5955 command(f"docker pull {splice_config.get('docker').get('image')}") 5956 except subprocess.CalledProcessError: 5957 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5958 log.error(msg_err) 5959 raise ValueError(msg_err) 5960 return None 5961 5962 # Config - splice databases 5963 splice_databases = ( 5964 config.get("folders", {}) 5965 .get("databases", {}) 5966 .get("splice", DEFAULT_SPLICE_FOLDER) 5967 ) 5968 splice_databases = full_path(splice_databases) 5969 5970 # Param 5971 param = self.get_param() 5972 log.debug("Param: " + str(param)) 5973 5974 # Param 5975 options = 
param.get("annotation", {}).get("splice", {}) 5976 log.debug("Options: " + str(options)) 5977 5978 # Data 5979 table_variants = self.get_table_variants() 5980 5981 # Check if not empty 5982 log.debug("Check if not empty") 5983 sql_query_chromosomes = ( 5984 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5985 ) 5986 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5987 log.info("VCF empty") 5988 return None 5989 5990 # Export in VCF 5991 log.debug("Create initial file to annotate") 5992 5993 # Create output folder 5994 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 5995 if not os.path.exists(output_folder): 5996 Path(output_folder).mkdir(parents=True, exist_ok=True) 5997 5998 # Create tmp VCF file 5999 tmp_vcf = NamedTemporaryFile( 6000 prefix=self.get_prefix(), 6001 dir=output_folder, 6002 suffix=".vcf", 6003 delete=False, 6004 ) 6005 tmp_vcf_name = tmp_vcf.name 6006 6007 # VCF header 6008 header = self.get_header() 6009 6010 # Existing annotations 6011 for vcf_annotation in self.get_header().infos: 6012 6013 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6014 log.debug( 6015 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6016 ) 6017 6018 # Memory limit 6019 if config.get("memory", None): 6020 memory_limit = config.get("memory", "8G").upper() 6021 # upper() 6022 else: 6023 memory_limit = "8G" 6024 log.debug(f"memory_limit: {memory_limit}") 6025 6026 # Check number of variants to annotate 6027 where_clause_regex_spliceai = r"SpliceAI_\w+" 6028 where_clause_regex_spip = r"SPiP_\w+" 6029 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6030 df_list_of_variants_to_annotate = self.get_query_to_df( 6031 query=f""" SELECT * FROM variants {where_clause} """ 6032 ) 6033 if len(df_list_of_variants_to_annotate) == 0: 6034 log.warning( 6035 f"No variants to 
annotate with splice. Variants probably already annotated with splice" 6036 ) 6037 return None 6038 else: 6039 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6040 6041 # Export VCF file 6042 self.export_variant_vcf( 6043 vcf_file=tmp_vcf_name, 6044 remove_info=True, 6045 add_samples=True, 6046 index=False, 6047 where_clause=where_clause, 6048 ) 6049 6050 # Create docker container and launch splice analysis 6051 if splice_config: 6052 6053 # Splice mount folders 6054 mount_folders = splice_config.get("mount", {}) 6055 6056 # Genome mount 6057 mount_folders[ 6058 config.get("folders", {}) 6059 .get("databases", {}) 6060 .get("genomes", DEFAULT_GENOME_FOLDER) 6061 ] = "ro" 6062 6063 # SpliceAI mount 6064 mount_folders[ 6065 config.get("folders", {}) 6066 .get("databases", {}) 6067 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6068 ] = "ro" 6069 6070 # Genome mount 6071 mount_folders[ 6072 config.get("folders", {}) 6073 .get("databases", {}) 6074 .get("spip", DEFAULT_SPIP_FOLDER) 6075 ] = "ro" 6076 6077 # Mount folders 6078 mount = [] 6079 6080 # Config mount 6081 mount = [ 6082 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6083 for path, mode in mount_folders.items() 6084 ] 6085 6086 if any(value for value in splice_config.values() if value is None): 6087 log.warning("At least one splice config parameter is empty") 6088 return None 6089 6090 # Params in splice nf 6091 def check_values(dico: dict): 6092 """ 6093 Ensure parameters for NF splice pipeline 6094 """ 6095 for key, val in dico.items(): 6096 if key == "genome": 6097 if any( 6098 assemb in options.get("genome", {}) 6099 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6100 ): 6101 yield f"--{key} hg19" 6102 elif any( 6103 assemb in options.get("genome", {}) 6104 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6105 ): 6106 yield f"--{key} hg38" 6107 elif ( 6108 (isinstance(val, str) and val) 6109 or isinstance(val, int) 6110 or isinstance(val, bool) 6111 ): 6112 yield f"--{key} 
{val}" 6113 6114 # Genome 6115 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6116 options["genome"] = genome 6117 6118 # NF params 6119 nf_params = [] 6120 6121 # Add options 6122 if options: 6123 nf_params = list(check_values(options)) 6124 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6125 else: 6126 log.debug("No NF params provided") 6127 6128 # Add threads 6129 if "threads" not in options.keys(): 6130 nf_params.append(f"--threads {threads}") 6131 6132 # Genome path 6133 genome_path = find_genome( 6134 config.get("folders", {}) 6135 .get("databases", {}) 6136 .get("genomes", DEFAULT_GENOME_FOLDER), 6137 file=f"{genome}.fa", 6138 ) 6139 # Add genome path 6140 if not genome_path: 6141 raise ValueError( 6142 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6143 ) 6144 else: 6145 log.debug(f"Genome: {genome_path}") 6146 nf_params.append(f"--genome_path {genome_path}") 6147 6148 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6149 """ 6150 Setting up updated databases for SPiP and SpliceAI 6151 """ 6152 6153 try: 6154 6155 # SpliceAI assembly transcriptome 6156 spliceai_assembly = os.path.join( 6157 config.get("folders", {}) 6158 .get("databases", {}) 6159 .get("spliceai", {}), 6160 options.get("genome"), 6161 "transcriptome", 6162 ) 6163 spip_assembly = options.get("genome") 6164 6165 spip = find( 6166 f"transcriptome_{spip_assembly}.RData", 6167 config.get("folders", {}).get("databases", {}).get("spip", {}), 6168 ) 6169 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6170 log.debug(f"SPiP annotations: {spip}") 6171 log.debug(f"SpliceAI annotations: {spliceai}") 6172 if spip and spliceai: 6173 return [ 6174 f"--spip_transcriptome {spip}", 6175 f"--spliceai_annotations {spliceai}", 6176 ] 6177 else: 6178 # TODO crash and go on with basic annotations ? 
6179 # raise ValueError( 6180 # "Can't find splice databases in configuration EXIT" 6181 # ) 6182 log.warning( 6183 "Can't find splice databases in configuration, use annotations file from image" 6184 ) 6185 except TypeError: 6186 log.warning( 6187 "Can't find splice databases in configuration, use annotations file from image" 6188 ) 6189 return [] 6190 6191 # Add options, check if transcriptome option have already beend provided 6192 if ( 6193 "spip_transcriptome" not in nf_params 6194 and "spliceai_transcriptome" not in nf_params 6195 ): 6196 splice_reference = splice_annotations(options, config) 6197 if splice_reference: 6198 nf_params.extend(splice_reference) 6199 6200 nf_params.append(f"--output_folder {output_folder}") 6201 6202 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6203 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6204 log.debug(cmd) 6205 6206 splice_config["docker"]["command"] = cmd 6207 6208 docker_cmd = get_bin_command( 6209 tool="splice", 6210 bin_type="docker", 6211 config=config, 6212 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6213 add_options=f"--name {random_uuid} {' '.join(mount)}", 6214 ) 6215 6216 # Docker debug 6217 # if splice_config.get("rm_container"): 6218 # rm_container = "--rm" 6219 # else: 6220 # rm_container = "" 6221 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6222 6223 log.debug(docker_cmd) 6224 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6225 log.debug(res.stdout) 6226 if res.stderr: 6227 log.error(res.stderr) 6228 res.check_returncode() 6229 else: 6230 log.warning(f"Splice tool configuration not found: {config}") 6231 
6232 # Update variants 6233 log.info("Annotation - Updating...") 6234 # Test find output vcf 6235 log.debug( 6236 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6237 ) 6238 output_vcf = [] 6239 # Wrong folder to look in 6240 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6241 if ( 6242 files 6243 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6244 ): 6245 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6246 # log.debug(os.listdir(options.get("output_folder"))) 6247 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6248 if not output_vcf: 6249 log.debug( 6250 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6251 ) 6252 else: 6253 # Get new header from annotated vcf 6254 log.debug(f"Initial header: {len(header.infos)} fields") 6255 # Create new header with splice infos 6256 new_vcf = Variants(input=output_vcf[0]) 6257 new_vcf_header = new_vcf.get_header().infos 6258 for keys, infos in new_vcf_header.items(): 6259 if keys not in header.infos.keys(): 6260 header.infos[keys] = infos 6261 log.debug(f"New header: {len(header.infos)} fields") 6262 log.debug(f"Splice tmp output: {output_vcf[0]}") 6263 self.update_from_vcf(output_vcf[0]) 6264 6265 # Remove folder 6266 remove_if_exists(output_folder) 6267 6268 ### 6269 # Prioritization 6270 ### 6271 6272 def get_config_default(self, name: str) -> dict: 6273 """ 6274 The function `get_config_default` returns a dictionary containing default configurations for 6275 various calculations and prioritizations. 6276 6277 :param name: The `get_config_default` function returns a dictionary containing default 6278 configurations for different calculations and prioritizations. 
The `name` parameter is used to 6279 specify which specific configuration to retrieve from the dictionary 6280 :type name: str 6281 :return: The function `get_config_default` returns a dictionary containing default configuration 6282 settings for different calculations and prioritizations. The specific configuration settings are 6283 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6284 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6285 returned. If there is no match, an empty dictionary is returned. 6286 """ 6287 6288 config_default = { 6289 "calculations": { 6290 "variant_chr_pos_alt_ref": { 6291 "type": "sql", 6292 "name": "variant_chr_pos_alt_ref", 6293 "description": "Create a variant ID with chromosome, position, alt and ref", 6294 "available": False, 6295 "output_column_name": "variant_chr_pos_alt_ref", 6296 "output_column_type": "String", 6297 "output_column_description": "variant ID with chromosome, position, alt and ref", 6298 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6299 "operation_info": True, 6300 }, 6301 "VARTYPE": { 6302 "type": "sql", 6303 "name": "VARTYPE", 6304 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6305 "available": True, 6306 "output_column_name": "VARTYPE", 6307 "output_column_type": "String", 6308 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6309 "operation_query": """ 6310 CASE 6311 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6312 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6313 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6314 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6315 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6316 ELSE 'UNDEFINED' 6317 END 6318 """, 6319 "info_fields": ["SVTYPE"], 6320 "operation_info": True, 6321 }, 6322 "snpeff_hgvs": { 6323 "type": "python", 6324 "name": "snpeff_hgvs", 6325 "description": "HGVS nomenclatures from snpEff annotation", 6326 "available": True, 6327 "function_name": "calculation_extract_snpeff_hgvs", 6328 "function_params": ["snpeff_hgvs", "ANN"], 6329 }, 6330 "snpeff_ann_explode": { 6331 "type": "python", 6332 "name": "snpeff_ann_explode", 6333 "description": "Explode snpEff annotations with uniquify values", 6334 "available": True, 6335 "function_name": "calculation_snpeff_ann_explode", 6336 "function_params": [False, "fields", "snpeff_", "ANN"], 6337 }, 6338 "snpeff_ann_explode_uniquify": { 6339 "type": "python", 6340 "name": "snpeff_ann_explode_uniquify", 6341 "description": "Explode snpEff annotations", 6342 "available": True, 6343 "function_name": "calculation_snpeff_ann_explode", 6344 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6345 }, 6346 "snpeff_ann_explode_json": { 6347 "type": "python", 6348 "name": "snpeff_ann_explode_json", 6349 "description": "Explode snpEff annotations in JSON format", 6350 "available": True, 6351 "function_name": "calculation_snpeff_ann_explode", 6352 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6353 }, 6354 "NOMEN": { 6355 "type": "python", 6356 "name": "NOMEN", 6357 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6358 "available": True, 6359 "function_name": "calculation_extract_nomen", 6360 "function_params": [], 6361 }, 6362 "FINDBYPIPELINE": { 6363 "type": "python", 6364 "name": "FINDBYPIPELINE", 6365 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6366 "available": True, 6367 "function_name": "calculation_find_by_pipeline", 6368 "function_params": ["findbypipeline"], 6369 }, 6370 "FINDBYSAMPLE": { 6371 "type": "python", 6372 "name": "FINDBYSAMPLE", 6373 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6374 "available": True, 6375 "function_name": "calculation_find_by_pipeline", 6376 "function_params": ["findbysample"], 6377 }, 6378 "GENOTYPECONCORDANCE": { 6379 "type": "python", 6380 "name": "GENOTYPECONCORDANCE", 6381 "description": "Concordance of genotype for multi caller VCF", 6382 "available": True, 6383 "function_name": "calculation_genotype_concordance", 6384 "function_params": [], 6385 }, 6386 "BARCODE": { 6387 "type": "python", 6388 "name": "BARCODE", 6389 "description": "BARCODE as VaRank tool", 6390 "available": True, 6391 "function_name": "calculation_barcode", 6392 "function_params": [], 6393 }, 6394 "BARCODEFAMILY": { 6395 "type": "python", 6396 "name": "BARCODEFAMILY", 6397 "description": "BARCODEFAMILY as VaRank tool", 6398 "available": True, 6399 "function_name": "calculation_barcode_family", 6400 "function_params": ["BCF"], 6401 }, 6402 "TRIO": { 6403 "type": "python", 6404 "name": "TRIO", 6405 "description": "Inheritance for a trio family", 6406 "available": True, 6407 "function_name": "calculation_trio", 6408 "function_params": [], 6409 }, 6410 "VAF": { 6411 "type": "python", 6412 "name": "VAF", 6413 "description": "Variant Allele Frequency (VAF) harmonization", 6414 "available": True, 6415 "function_name": "calculation_vaf_normalization", 6416 "function_params": [], 6417 }, 6418 "VAF_stats": { 6419 "type": "python", 6420 "name": 
"VAF_stats", 6421 "description": "Variant Allele Frequency (VAF) statistics", 6422 "available": True, 6423 "function_name": "calculation_genotype_stats", 6424 "function_params": ["VAF"], 6425 }, 6426 "DP_stats": { 6427 "type": "python", 6428 "name": "DP_stats", 6429 "description": "Depth (DP) statistics", 6430 "available": True, 6431 "function_name": "calculation_genotype_stats", 6432 "function_params": ["DP"], 6433 }, 6434 "variant_id": { 6435 "type": "python", 6436 "name": "variant_id", 6437 "description": "Variant ID generated from variant position and type", 6438 "available": True, 6439 "function_name": "calculation_variant_id", 6440 "function_params": [], 6441 }, 6442 "transcripts_json": { 6443 "type": "python", 6444 "name": "transcripts_json", 6445 "description": "Add transcripts info in JSON format (field 'transcripts_json')", 6446 "available": True, 6447 "function_name": "calculation_transcripts_json", 6448 "function_params": ["transcripts_json"], 6449 }, 6450 }, 6451 "prioritizations": { 6452 "default": { 6453 "filter": [ 6454 { 6455 "type": "notequals", 6456 "value": "!PASS|\\.", 6457 "score": 0, 6458 "flag": "FILTERED", 6459 "comment": ["Bad variant quality"], 6460 }, 6461 { 6462 "type": "equals", 6463 "value": "REJECT", 6464 "score": -20, 6465 "flag": "PASS", 6466 "comment": ["Bad variant quality"], 6467 }, 6468 ], 6469 "DP": [ 6470 { 6471 "type": "gte", 6472 "value": "50", 6473 "score": 5, 6474 "flag": "PASS", 6475 "comment": ["DP higher than 50"], 6476 } 6477 ], 6478 "ANN": [ 6479 { 6480 "type": "contains", 6481 "value": "HIGH", 6482 "score": 5, 6483 "flag": "PASS", 6484 "comment": [ 6485 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6486 ], 6487 }, 6488 { 6489 "type": "contains", 6490 "value": "MODERATE", 6491 "score": 3, 6492 "flag": "PASS", 6493 "comment": [ 6494 "A non-disruptive variant that might change protein effectiveness" 
6495 ], 6496 }, 6497 { 6498 "type": "contains", 6499 "value": "LOW", 6500 "score": 0, 6501 "flag": "FILTERED", 6502 "comment": [ 6503 "Assumed to be mostly harmless or unlikely to change protein behavior" 6504 ], 6505 }, 6506 { 6507 "type": "contains", 6508 "value": "MODIFIER", 6509 "score": 0, 6510 "flag": "FILTERED", 6511 "comment": [ 6512 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6513 ], 6514 }, 6515 ], 6516 } 6517 }, 6518 } 6519 6520 return config_default.get(name, None) 6521 6522 def get_config_json( 6523 self, name: str, config_dict: dict = {}, config_file: str = None 6524 ) -> dict: 6525 """ 6526 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6527 default values, a dictionary, and a file. 6528 6529 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6530 the name of the configuration. It is used to identify and retrieve the configuration settings 6531 for a specific component or module 6532 :type name: str 6533 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6534 dictionary that allows you to provide additional configuration settings or overrides. When you 6535 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6536 the key is the configuration setting you want to override or 6537 :type config_dict: dict 6538 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6539 specify the path to a configuration file that contains additional settings. 
    def prioritization(self) -> None:
        """
        Compute prioritization INFO fields (PZScore, PZFlag, PZComment,
        PZInfos, PZTags, plus per-profile variants of each) and append them
        to the INFO column of the variants table.

        Profiles and options are read from the "prioritization" section of
        the parameters; profile definitions come from the built-in defaults,
        optionally overridden by a configuration file
        ("prioritization_config"). Scores are combined per the
        "prioritization_score_mode" ("HOWARD" adds, "VaRank" keeps the max).
        """

        # Config
        # NOTE(review): `config` appears unused in this method — presumably
        # kept for symmetry with sibling methods; confirm before removing
        config = self.get_config()

        # Param
        param = self.get_param()

        # Quick Prioritizations
        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")

        # Configuration profiles: defaults merged with an optional config file
        prioritization_config_file = param.get("prioritization", {}).get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization options (comma-separated strings are accepted for
        # profiles and pzfields)
        profiles = param.get("prioritization", {}).get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = param.get("prioritization", {}).get(
            "pzfields", ["PZFlag", "PZScore"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = param.get("prioritization", {}).get("default_profile", None)
        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
        prioritization_score_mode = param.get("prioritization", {}).get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: extra profiles listed directly in param
        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f" {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Every requested profile must exist in the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            # Nothing to do without at least one profile
            log.debug(f"No profile defined")
            return

        # First requested profile becomes the default if none was given
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Columns added temporarily; dropped at the end of the method
        added_columns = []

        # Create list of PZfields
        # List of PZFields: bare names plus one "<field><sep><profile>"
        # per (field, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Keep only PZfields not already present in the VCF header
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos fields into columns so criteria can query them
            explode_infos_prefix = self.get_explode_infos_prefix()
            added_columns += self.explode_infos(prefix=explode_infos_prefix)
            extra_infos = self.get_extra_infos()

            # PZfields tags description (VCF header metadata per field)
            PZfields_INFOS = {
                "PZTags": {
                    "ID": "PZTags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                "PZScore": {
                    "ID": "PZScore",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                "PZFlag": {
                    "ID": "PZFlag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                "PZComment": {
                    "ID": "PZComment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                "PZInfos": {
                    "ID": "PZInfos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
            }

            # Create INFO header fields if not exist (default-profile variant)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO header fields if not exist, for each profile
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Add one working column per PZfield:
            # scores are INTEGER 0, flags BOOLEAN 1 (PASS), others STRING ''
            for pzfield in list_of_pzfields:
                if re.match("PZScore.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match("PZFlag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        # SQL snippets, each producing one 'key=value' INFO item
                        sql_set_info = []

                        # PZ fields set

                        # PZScore (per-profile, plus bare field for the default profile)
                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZScore{pzfields_sep}{profile}=',
                                    PZScore{pzfields_sep}{profile}
                                )
                                """
                            )
                            if (
                                profile == default_profile
                                and "PZScore" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    concat(
                                        'PZScore=',
                                        PZScore{pzfields_sep}{profile}
                                    )
                                    """
                                )

                        # PZFlag (boolean column rendered as PASS/FILTERED)
                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZFlag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN PZFlag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN PZFlag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                            if (
                                profile == default_profile
                                and "PZFlag" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    concat(
                                        'PZFlag=',
                                        CASE
                                            WHEN PZFlag{pzfields_sep}{profile}==1
                                            THEN 'PASS'
                                            WHEN PZFlag{pzfields_sep}{profile}==0
                                            THEN 'FILTERED'
                                        END
                                    )
                                    """
                                )

                        # PZComment (emitted only when non-empty)
                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                            if (
                                profile == default_profile
                                and "PZComment" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    CASE
                                        WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
                                        THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
                                        ELSE ''
                                    END
                                    """
                                )

                        # PZInfos (emitted only when non-empty)
                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                            if (
                                profile == default_profile
                                and "PZInfos" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    CASE
                                        WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
                                        THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
                                        ELSE ''
                                    END
                                    """
                                )

                        # Merge PZfields snippets into one concat(...) argument
                        # list, ';'-separated after the first item
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # Check if annotation field is present in exploded data
                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
                                log.debug(f"Annotation '{annotation}' not in data")
                                continue
                            else:
                                log.debug(f"Annotation '{annotation}' in data")

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:
                                criterion_type = criterion["type"]
                                criterion_value = criterion["value"]
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Escape quotes/separators so the text can be
                                # embedded in SQL and the INFO column
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                sql_set = []
                                sql_set_info = []

                                # PZ fields set
                                if (
                                    f"PZScore{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # HOWARD sums criterion scores; VaRank
                                    # keeps the maximum score seen
                                    if prioritization_score_mode == "HOWARD":
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                    elif prioritization_score_mode == "VaRank":
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    else:
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
                                    # AND-combine: one FILTERED criterion
                                    # filters the variant for this profile
                                    sql_set.append(
                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )
                                if (
                                    f"PZComment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        PZComment{pzfields_sep}{profile} =
                                            concat(
                                                PZComment{pzfields_sep}{profile},
                                                CASE
                                                    WHEN PZComment{pzfields_sep}{profile}!=''
                                                    THEN ', '
                                                    ELSE ''
                                                END,
                                                '{criterion_comment}'
                                            )
                                        """
                                    )
                                if (
                                    f"PZInfos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        PZInfos{pzfields_sep}{profile} =
                                            concat(
                                                PZInfos{pzfields_sep}{profile},
                                                '{criterion_infos}'
                                            )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison: numeric values use
                                # the mapped comparison operator; non-numeric
                                # values fall back to SIMILAR TO matching
                                try:
                                    float(criterion_value)
                                    sql_update = f"""
                                        UPDATE {table_variants}
                                        SET {sql_set_option}
                                        WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                        AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
                                    """
                                except:
                                    contains_option = ""
                                    if criterion_type == "contains":
                                        contains_option = ".*"
                                    sql_update = f"""
                                        UPDATE {table_variants}
                                        SET {sql_set_option}
                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                    """
                                sql_queries.append(sql_update)

                        # PZTags
                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:

                            # Create PZTags value: 'field#value' pairs joined by '|'
                            pztags_value = ""
                            pztags_sep_default = "|"
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in ["PZTags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in ["PZFlag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN PZFlag{pzfields_sep}{profile}
                                                    THEN 'PASS'
                                                    ELSE 'FILTERED'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZTags (per-profile)
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    'PZTags{pzfields_sep}{profile}={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZTags for default profile
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        'PZTags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        # Apply all criterion updates for this profile
                        if sql_queries:

                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                        # Append the computed PZ key=value items to INFO
                        log.info(f"""Profile '{profile}' - Update... """)
                        sql_query_update = f"""
                            UPDATE {table_variants}
                            SET INFO =
                                concat(
                                    CASE
                                        WHEN INFO NOT IN ('','.')
                                        THEN concat(INFO, ';')
                                        ELSE ''
                                    END
                                    {sql_set_info_option}
                                )
                        """
                        self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns (working columns are temporary)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles, and appends the result as an 'hgvs=' item in
        the INFO column.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            Apply `annotation_hgvs_partition` to each row of the `partition`
            DataFrame (axis=1) and return the resulting Series.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            Compute the comma-joined list of HGVS names for one variant row
            (keys: CHROM, POS, REF, ALT), using the enclosing scope's
            transcripts, genome and HGVS options.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of transcripts overlapping this position
            # (refseq_df is resolved by the polars SQL context from globals)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon (only resolved when the use_exon option is enabled)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession, looked up from refSeqLink when needed
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # add_protein emits a second, protein-level name in addition
                # to the default one (unless already covered by the options)
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations (comma-joined)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # NOTE(review): `pl` is not imported in this file's visible header —
        # presumably provided by a wildcard import; confirm
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "opt" or "opt=val" comma-separated options into
        # the param["hgvs"] dict ("TRUE"/"FALSE" become booleans)
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f" {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; otherwise nothing to do
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink: param overrides config
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit path first, then by folder+assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (alphabetic REF/ALT alleles)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Columns added temporarily; dropped at the end of the method
        added_columns = []

        # Add hgvs working column (random suffix avoids name collisions)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading refSeq transcripts overlapping the variants into a Dataframe
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading transcript->protein accession mapping into a Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table}
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model, via a TSV export
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # NOTE(review): re-binds the SQL context so the refseq/refseqlink
        # dataframes created above are registered as globals
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from Pandas dataframe with partition as
        # number of threads
        # NOTE(review): `dd` is not imported in this file's visible header —
        # presumably provided by a wildcard import; confirm
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs working column by joining on the variant key
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

            # Append 'hgvs=<value>' to the INFO column for annotated variants
            sql_query_update = f"""
                UPDATE {table_variants}
                SET INFO =
                    concat(
                        CASE
                            WHEN INFO NOT IN ('','.')
                            THEN concat(INFO, ';')
                            ELSE ''
                        END,
                        'hgvs=',
                        {hgvs_column_name}
                    )
                WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
            self.execute_query(sql_query_update)

        # Add 'hgvs' INFO field to the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (working columns are temporary)
        for added_column in added_columns:
            self.drop_column(column=added_column)
7439 with tempfile.TemporaryDirectory() as tmpdir: 7440 df_parquet = os.path.join(tmpdir, "df.parquet") 7441 df.to_parquet(df_parquet) 7442 7443 # Update hgvs column 7444 update_variant_query = f""" 7445 UPDATE {table_variants} 7446 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 7447 FROM read_parquet('{df_parquet}') as df 7448 WHERE variants."#CHROM" = df.CHROM 7449 AND variants.POS = df.POS 7450 AND variants.REF = df.REF 7451 AND variants.ALT = df.ALT 7452 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 7453 """ 7454 self.execute_query(update_variant_query) 7455 7456 # Update INFO column 7457 sql_query_update = f""" 7458 UPDATE {table_variants} 7459 SET INFO = 7460 concat( 7461 CASE 7462 WHEN INFO NOT IN ('','.') 7463 THEN concat(INFO, ';') 7464 ELSE '' 7465 END, 7466 'hgvs=', 7467 {hgvs_column_name} 7468 ) 7469 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 7470 """ 7471 self.execute_query(sql_query_update) 7472 7473 # Add header 7474 HGVS_INFOS = { 7475 "hgvs": { 7476 "ID": "hgvs", 7477 "Number": ".", 7478 "Type": "String", 7479 "Description": f"HGVS annotatation with HOWARD", 7480 } 7481 } 7482 7483 for field in HGVS_INFOS: 7484 field_ID = HGVS_INFOS[field]["ID"] 7485 field_description = HGVS_INFOS[field]["Description"] 7486 self.get_header().infos[field_ID] = vcf.parser._Info( 7487 field_ID, 7488 HGVS_INFOS[field]["Number"], 7489 HGVS_INFOS[field]["Type"], 7490 field_description, 7491 "unknown", 7492 "unknown", 7493 code_type_map[HGVS_INFOS[field]["Type"]], 7494 ) 7495 7496 # Remove added columns 7497 for added_column in added_columns: 7498 self.drop_column(column=added_column) 7499 7500 ### 7501 # Calculation 7502 ### 7503 7504 def get_operations_help( 7505 self, operations_config_dict: dict = {}, operations_config_file: str = None 7506 ) -> list: 7507 7508 # Init 7509 operations_help = [] 7510 7511 # operations 7512 operations = self.get_config_json( 7513 name="calculations", 7514 
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run a set of calculation operations on the variants table.

        Each requested operation is looked up (case-insensitively) in the
        operations configuration and dispatched to either
        `calculation_process_function` (type "python") or
        `calculation_process_sql` (type "sql").

        Example of the expected param structure::

            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle": null
                }
            }

        :param operations: operations to perform; overridden by param
            "calculation.calculations" when that section is present
        :param operations_config_dict: operations configuration as a dict
        :param operations_config_file: path to an operations configuration file
        :raises ValueError: if an operation name or operation type is not
            available in the operations configuration
        """

        # Param
        param = self.get_param()

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys: operation names are matched case-insensitively
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (param takes precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add: comma-separated shortcut list in param "calculations"
        if param.get("calculations", None):
            calculations_list = [
                value for value in param.get("calculations", "").split(",")
            ]
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations:
                    operations[calculation_operation.upper()] = {}
                    # Also record the quick operation into param so downstream
                    # readers of param see the full calculation list
                    add_value_into_dict(
                        dict_tree=param,
                        sections=[
                            "calculation",
                            "calculations",
                            calculation_operation.upper(),
                        ],
                        value={},
                    )

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        # Default operation type is "sql"
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
add_value_into_dict( 7583 dict_tree=param, 7584 sections=[ 7585 "calculation", 7586 "calculations", 7587 calculation_operation.upper(), 7588 ], 7589 value={}, 7590 ) 7591 7592 # Operations for calculation 7593 if not operations: 7594 operations = param.get("calculation", {}).get("calculations", {}) 7595 7596 if operations: 7597 log.info(f"Calculations...") 7598 7599 # For each operations 7600 for operation_name in operations: 7601 operation_name = operation_name.upper() 7602 if operation_name not in [""]: 7603 if operation_name in operations_config: 7604 log.info(f"Calculation '{operation_name}'") 7605 operation = operations_config[operation_name] 7606 operation_type = operation.get("type", "sql") 7607 if operation_type == "python": 7608 self.calculation_process_function( 7609 operation=operation, operation_name=operation_name 7610 ) 7611 elif operation_type == "sql": 7612 self.calculation_process_sql( 7613 operation=operation, operation_name=operation_name 7614 ) 7615 else: 7616 log.error( 7617 f"Operations config: Type '{operation_type}' NOT available" 7618 ) 7619 raise ValueError( 7620 f"Operations config: Type '{operation_type}' NOT available" 7621 ) 7622 else: 7623 log.error( 7624 f"Operations config: Calculation '{operation_name}' NOT available" 7625 ) 7626 raise ValueError( 7627 f"Operations config: Calculation '{operation_name}' NOT available" 7628 ) 7629 7630 # Explode INFOS fields into table fields 7631 if self.get_explode_infos(): 7632 self.explode_infos( 7633 prefix=self.get_explode_infos_prefix(), 7634 fields=self.get_explode_infos_fields(), 7635 force=True, 7636 ) 7637 7638 def calculation_process_sql( 7639 self, operation: dict, operation_name: str = "unknown" 7640 ) -> None: 7641 """ 7642 The `calculation_process_sql` function takes in a mathematical operation as a string and 7643 performs the operation, updating the specified table with the result. 
7644 7645 :param operation: The `operation` parameter is a dictionary that contains information about the 7646 mathematical operation to be performed. It includes the following keys: 7647 :type operation: dict 7648 :param operation_name: The `operation_name` parameter is a string that represents the name of 7649 the mathematical operation being performed. It is used for logging and error handling purposes, 7650 defaults to unknown 7651 :type operation_name: str (optional) 7652 """ 7653 7654 # table variants 7655 table_variants = self.get_table_variants(clause="alter") 7656 7657 # Operation infos 7658 operation_name = operation.get("name", "unknown") 7659 log.debug(f"process sql {operation_name}") 7660 output_column_name = operation.get("output_column_name", operation_name) 7661 output_column_type = operation.get("output_column_type", "String") 7662 prefix = operation.get("explode_infos_prefix", "") 7663 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7664 output_column_description = operation.get( 7665 "output_column_description", f"{operation_name} operation" 7666 ) 7667 operation_query = operation.get("operation_query", None) 7668 if isinstance(operation_query, list): 7669 operation_query = " ".join(operation_query) 7670 operation_info_fields = operation.get("info_fields", []) 7671 operation_info_fields_check = operation.get("info_fields_check", False) 7672 operation_info = operation.get("operation_info", True) 7673 7674 if operation_query: 7675 7676 # Info fields check 7677 operation_info_fields_check_result = True 7678 if operation_info_fields_check: 7679 header_infos = self.get_header().infos 7680 for info_field in operation_info_fields: 7681 operation_info_fields_check_result = ( 7682 operation_info_fields_check_result 7683 and info_field in header_infos 7684 ) 7685 7686 # If info fields available 7687 if operation_info_fields_check_result: 7688 7689 # Added_columns 7690 added_columns = [] 7691 7692 # Create VCF header field 
7693 vcf_reader = self.get_header() 7694 vcf_reader.infos[output_column_name] = vcf.parser._Info( 7695 output_column_name, 7696 ".", 7697 output_column_type, 7698 output_column_description, 7699 "howard calculation", 7700 "0", 7701 self.code_type_map.get(output_column_type), 7702 ) 7703 7704 # Explode infos if needed 7705 log.debug(f"calculation_process_sql prefix {prefix}") 7706 added_columns += self.explode_infos( 7707 prefix=prefix, 7708 fields=[output_column_name] + operation_info_fields, 7709 force=True, 7710 ) 7711 7712 # Create column 7713 added_column = self.add_column( 7714 table_name=table_variants, 7715 column_name=prefix + output_column_name, 7716 column_type=output_column_type_sql, 7717 default_value="null", 7718 ) 7719 added_columns.append(added_column) 7720 7721 # Operation calculation 7722 try: 7723 7724 # Query to update calculation column 7725 sql_update = f""" 7726 UPDATE {table_variants} 7727 SET "{prefix}{output_column_name}" = ({operation_query}) 7728 """ 7729 self.conn.execute(sql_update) 7730 7731 # Add to INFO 7732 if operation_info: 7733 sql_update_info = f""" 7734 UPDATE {table_variants} 7735 SET "INFO" = 7736 concat( 7737 CASE 7738 WHEN "INFO" IS NOT NULL 7739 THEN concat("INFO", ';') 7740 ELSE '' 7741 END, 7742 '{output_column_name}=', 7743 "{prefix}{output_column_name}" 7744 ) 7745 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 7746 """ 7747 self.conn.execute(sql_update_info) 7748 7749 except: 7750 log.error( 7751 f"Operations config: Calculation '{operation_name}' query failed" 7752 ) 7753 raise ValueError( 7754 f"Operations config: Calculation '{operation_name}' query failed" 7755 ) 7756 7757 # Remove added columns 7758 for added_column in added_columns: 7759 log.debug(f"added_column: {added_column}") 7760 self.drop_column(column=added_column) 7761 7762 else: 7763 log.error( 7764 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 7765 ) 7766 raise ValueError( 7767 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7768 ) 7769 7770 else: 7771 log.error( 7772 f"Operations config: Calculation '{operation_name}' query NOT defined" 7773 ) 7774 raise ValueError( 7775 f"Operations config: Calculation '{operation_name}' query NOT defined" 7776 ) 7777 7778 def calculation_process_function( 7779 self, operation: dict, operation_name: str = "unknown" 7780 ) -> None: 7781 """ 7782 The `calculation_process_function` takes in an operation dictionary and performs the specified 7783 function with the given parameters. 7784 7785 :param operation: The `operation` parameter is a dictionary that contains information about the 7786 operation to be performed. It has the following keys: 7787 :type operation: dict 7788 :param operation_name: The `operation_name` parameter is a string that represents the name of 7789 the operation being performed. It is used for logging purposes, defaults to unknown 7790 :type operation_name: str (optional) 7791 """ 7792 7793 operation_name = operation["name"] 7794 log.debug(f"process sql {operation_name}") 7795 function_name = operation["function_name"] 7796 function_params = operation["function_params"] 7797 getattr(self, function_name)(*function_params) 7798 7799 def calculation_variant_id(self) -> None: 7800 """ 7801 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 7802 updates the INFO field of a variants table with the variant ID. 
7803 """ 7804 7805 # variant_id annotation field 7806 variant_id_tag = self.get_variant_id_column() 7807 added_columns = [variant_id_tag] 7808 7809 # variant_id hgvs tags" 7810 vcf_infos_tags = { 7811 variant_id_tag: "howard variant ID annotation", 7812 } 7813 7814 # Variants table 7815 table_variants = self.get_table_variants() 7816 7817 # Header 7818 vcf_reader = self.get_header() 7819 7820 # Add variant_id to header 7821 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 7822 variant_id_tag, 7823 ".", 7824 "String", 7825 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 7826 "howard calculation", 7827 "0", 7828 self.code_type_map.get("String"), 7829 ) 7830 7831 # Update 7832 sql_update = f""" 7833 UPDATE {table_variants} 7834 SET "INFO" = 7835 concat( 7836 CASE 7837 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7838 THEN '' 7839 ELSE concat("INFO", ';') 7840 END, 7841 '{variant_id_tag}=', 7842 "{variant_id_tag}" 7843 ) 7844 """ 7845 self.conn.execute(sql_update) 7846 7847 # Remove added columns 7848 for added_column in added_columns: 7849 self.drop_column(column=added_column) 7850 7851 def calculation_extract_snpeff_hgvs( 7852 self, 7853 snpeff_hgvs: str = "snpeff_hgvs", 7854 snpeff_field: str = "ANN", 7855 ) -> None: 7856 """ 7857 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 7858 annotation field in a VCF file and adds them as a new column in the variants table. 7859 7860 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 7861 function is used to specify the name of the column that will store the HGVS nomenclatures 7862 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 7863 snpeff_hgvs 7864 :type snpeff_hgvs: str (optional) 7865 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 7866 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 7867 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 7868 to ANN 7869 :type snpeff_field: str (optional) 7870 """ 7871 7872 # Snpeff hgvs tags 7873 vcf_infos_tags = { 7874 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 7875 } 7876 7877 # Prefix 7878 prefix = self.get_explode_infos_prefix() 7879 if prefix: 7880 prefix = "INFO/" 7881 7882 # snpEff fields 7883 speff_ann_infos = prefix + snpeff_field 7884 speff_hgvs_infos = prefix + snpeff_hgvs 7885 7886 # Variants table 7887 table_variants = self.get_table_variants() 7888 7889 # Header 7890 vcf_reader = self.get_header() 7891 7892 # Add columns 7893 added_columns = [] 7894 7895 # Explode HGVS field in column 7896 added_columns += self.explode_infos(fields=[snpeff_field]) 7897 7898 if snpeff_field in vcf_reader.infos: 7899 7900 log.debug(vcf_reader.infos[snpeff_field]) 7901 7902 # Extract ANN header 7903 ann_description = vcf_reader.infos[snpeff_field].desc 7904 pattern = r"'(.+?)'" 7905 match = re.search(pattern, ann_description) 7906 if match: 7907 ann_header_match = match.group(1).split(" | ") 7908 ann_header_desc = {} 7909 for i in range(len(ann_header_match)): 7910 ann_header_info = "".join( 7911 char for char in ann_header_match[i] if char.isalnum() 7912 ) 7913 ann_header_desc[ann_header_info] = ann_header_match[i] 7914 if not ann_header_desc: 7915 raise ValueError("Invalid header description format") 7916 else: 7917 raise ValueError("Invalid header description format") 7918 7919 # Create variant id 7920 variant_id_column = self.get_variant_id_column() 7921 added_columns += [variant_id_column] 7922 7923 # Create dataframe 7924 dataframe_snpeff_hgvs = self.get_query_to_df( 7925 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 7926 ) 7927 7928 # Create main NOMEN column 7929 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 7930 speff_ann_infos 7931 ].apply( 7932 lambda x: extract_snpeff_hgvs( 
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Explode snpEff annotations into INFO tags, either one tag per
        annotation field or a single JSON tag.

        Logs a warning and does nothing when the snpEff field is absent from
        the VCF header.

        :param uniquify: whether duplicated annotation values are removed,
            defaults to True
        :param output_format: "fields" for one INFO tag per annotation field,
            or "JSON" for a single JSON-formatted tag, defaults to "fields"
        :param output_prefix: prefix of the generated INFO tags, defaults to
            "snpeff_"
        :param snpeff_field: INFO field holding the snpEff annotations,
            defaults to "ANN"
        :raises ValueError: if the snpEff header description cannot be parsed
        """

        # SnpEff annotation field
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any configured explode prefix is forced to "INFO/" —
        # confirm this override is intended (same pattern as
        # calculation_extract_snpeff_hgvs)
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (temporary, dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: field names are single-quoted in the header
            # description and separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the identifier
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns (annotations exploded row by row in Python)
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: a single JSON tag, or one tag per annotation field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update INFO from the dataframe — presumably DuckDB resolves
            # 'dataframe_snpeff_hgvs' by the local variable name (replacement
            # scan), so the variable name is part of the query contract
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_extract_nomen(self) -> None:
        """
        Extract the NOMEN hgvs nomenclature fields from the hgvs annotation and
        append them to the INFO field.

        The hgvs field name and an optional transcripts-of-preference file are
        read from param "calculation.calculations.NOMEN.options". Does nothing
        when the exploded hgvs column is not available.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # NOMEN field: column holding the per-variant dict of NOMEN components
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: INFO tag -> header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the file is the transcript list,
                # in preference order
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Added columns (temporary, dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column (dict of NOMEN components per variant)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column (the lambda is applied
                # immediately each iteration, so capturing nomen_field is safe)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update: one CASE fragment per NOMEN component
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO from the dataframe — presumably DuckDB resolves
            # 'dataframe_hgvs' by the local variable name (replacement scan)
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
into a column 8248 dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply( 8249 lambda x: dict(x).get(nomen_field, "") 8250 ) 8251 8252 # Create VCF header field 8253 vcf_reader.infos[nomen_field] = vcf.parser._Info( 8254 nomen_field, 8255 ".", 8256 "String", 8257 nomen_dict.get(nomen_field, "howard calculation NOMEN"), 8258 "howard calculation", 8259 "0", 8260 self.code_type_map.get("String"), 8261 ) 8262 sql_nomen_fields.append( 8263 f""" 8264 CASE 8265 WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('') 8266 THEN concat( 8267 ';{nomen_field}=', 8268 dataframe_hgvs."{nomen_field}" 8269 ) 8270 ELSE '' 8271 END 8272 """ 8273 ) 8274 8275 # SQL set for update 8276 sql_nomen_fields_set = ", ".join(sql_nomen_fields) 8277 8278 # Update 8279 sql_update = f""" 8280 UPDATE variants 8281 SET "INFO" = 8282 concat( 8283 CASE 8284 WHEN "INFO" IS NULL 8285 THEN '' 8286 ELSE "INFO" 8287 END, 8288 {sql_nomen_fields_set} 8289 ) 8290 FROM dataframe_hgvs 8291 WHERE variants."#CHROM" = dataframe_hgvs."#CHROM" 8292 AND variants."POS" = dataframe_hgvs."POS" 8293 AND variants."REF" = dataframe_hgvs."REF" 8294 AND variants."ALT" = dataframe_hgvs."ALT" 8295 """ 8296 self.conn.execute(sql_update) 8297 8298 # Delete dataframe 8299 del dataframe_hgvs 8300 gc.collect() 8301 8302 # Remove added columns 8303 for added_column in added_columns: 8304 self.drop_column(column=added_column) 8305 8306 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 8307 """ 8308 The function `calculation_find_by_pipeline` performs a calculation to find the number of 8309 pipeline/sample for a variant and updates the variant information in a VCF file. 8310 8311 :param tag: The `tag` parameter is a string that represents the annotation field for the 8312 "findbypipeline" information in the VCF file. 
It is used to create the annotation field in the 8313 VCF header and to update the corresponding field in the variants table, defaults to 8314 findbypipeline 8315 :type tag: str (optional) 8316 """ 8317 8318 # if FORMAT and samples 8319 if ( 8320 "FORMAT" in self.get_header_columns_as_list() 8321 and self.get_header_sample_list() 8322 ): 8323 8324 # findbypipeline annotation field 8325 findbypipeline_tag = tag 8326 8327 # VCF infos tags 8328 vcf_infos_tags = { 8329 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 8330 } 8331 8332 # Prefix 8333 prefix = self.get_explode_infos_prefix() 8334 8335 # Field 8336 findbypipeline_infos = prefix + findbypipeline_tag 8337 8338 # Variants table 8339 table_variants = self.get_table_variants() 8340 8341 # Header 8342 vcf_reader = self.get_header() 8343 8344 # Create variant id 8345 variant_id_column = self.get_variant_id_column() 8346 added_columns = [variant_id_column] 8347 8348 # variant_id, FORMAT and samples 8349 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8350 self.get_header_sample_list() 8351 ) 8352 8353 # Create dataframe 8354 dataframe_findbypipeline = self.get_query_to_df( 8355 f""" SELECT {samples_fields} FROM {table_variants} """ 8356 ) 8357 8358 # Create findbypipeline column 8359 dataframe_findbypipeline[findbypipeline_infos] = ( 8360 dataframe_findbypipeline.apply( 8361 lambda row: findbypipeline( 8362 row, samples=self.get_header_sample_list() 8363 ), 8364 axis=1, 8365 ) 8366 ) 8367 8368 # Add snpeff_hgvs to header 8369 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 8370 findbypipeline_tag, 8371 ".", 8372 "String", 8373 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 8374 "howard calculation", 8375 "0", 8376 self.code_type_map.get("String"), 8377 ) 8378 8379 # Update 8380 sql_update = f""" 8381 UPDATE variants 8382 SET "INFO" = 8383 concat( 8384 CASE 8385 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8386 THEN '' 8387 ELSE 
concat("INFO", ';') 8388 END, 8389 CASE 8390 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 8391 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 8392 THEN concat( 8393 '{findbypipeline_tag}=', 8394 dataframe_findbypipeline."{findbypipeline_infos}" 8395 ) 8396 ELSE '' 8397 END 8398 ) 8399 FROM dataframe_findbypipeline 8400 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 8401 """ 8402 self.conn.execute(sql_update) 8403 8404 # Remove added columns 8405 for added_column in added_columns: 8406 self.drop_column(column=added_column) 8407 8408 # Delete dataframe 8409 del dataframe_findbypipeline 8410 gc.collect() 8411 8412 def calculation_genotype_concordance(self) -> None: 8413 """ 8414 The function `calculation_genotype_concordance` calculates the genotype concordance for 8415 multi-caller VCF files and updates the variant information in the database. 8416 """ 8417 8418 # if FORMAT and samples 8419 if ( 8420 "FORMAT" in self.get_header_columns_as_list() 8421 and self.get_header_sample_list() 8422 ): 8423 8424 # genotypeconcordance annotation field 8425 genotypeconcordance_tag = "genotypeconcordance" 8426 8427 # VCF infos tags 8428 vcf_infos_tags = { 8429 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 8430 } 8431 8432 # Prefix 8433 prefix = self.get_explode_infos_prefix() 8434 8435 # Field 8436 genotypeconcordance_infos = prefix + genotypeconcordance_tag 8437 8438 # Variants table 8439 table_variants = self.get_table_variants() 8440 8441 # Header 8442 vcf_reader = self.get_header() 8443 8444 # Create variant id 8445 variant_id_column = self.get_variant_id_column() 8446 added_columns = [variant_id_column] 8447 8448 # variant_id, FORMAT and samples 8449 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8450 self.get_header_sample_list() 8451 ) 8452 8453 # Create dataframe 8454 dataframe_genotypeconcordance = self.get_query_to_df( 8455 f""" SELECT 
    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        Compute the VaRank barcode of each variant across samples and append it
        to the INFO field.

        Does nothing when the VCF has no FORMAT column or no sample.

        :param tag: INFO tag name receiving the barcode; falsy values fall back
            to "barcode", defaults to "barcode"
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field
            if not tag:
                tag = "barcode"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (computed row by row in Python)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Add barcode to header
            # NOTE(review): the fallback of this .get() is the same lookup and
            # thus redundant; the key is always present in vcf_infos_tags
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO from the dataframe — presumably DuckDB resolves
            # 'dataframe_barcode' by the local variable name (replacement scan)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                            AND dataframe_barcode."{barcode_infos}" NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        for a family of samples (given by a pedigree) and appends the result as two new genotype
        values (the barcode and the list of samples used) on every sample column and on FORMAT.

        The pedigree is taken from param 'calculation.calculations.BARCODEFAMILY.family_pedigree'
        and may be a JSON file path, a JSON string, a comma-separated list of sample names, or a
        dict; when absent, all header samples are used.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not in a recognized format or resolves to no samples
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field
            if not tag:
                tag = "BCF"

            # VCF infos tags: one for the barcode itself, one for the sample list
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated sample names
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as comma-separated list, map each sample to itself
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of samples from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree: use every sample in the header, each mapped to itself
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (computed in Python over the pedigree samples)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields to header (BCF value and BCFS sample list)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: build one SET clause per sample column (plus FORMAT).
            # Pedigree samples get the barcode + sample list; FORMAT gets the tag
            # names; other samples get '.' placeholders.
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Regex used to turn FORMAT into a './.:.:.…' placeholder for empty genotypes
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
vcf.parser._Format( 8741 id=tag, 8742 num=".", 8743 type="String", 8744 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 8745 type_code=self.code_type_map.get("String"), 8746 ) 8747 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 8748 id=f"{tag}S", 8749 num=".", 8750 type="String", 8751 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 8752 type_code=self.code_type_map.get("String"), 8753 ) 8754 8755 # Update 8756 # for sample in ped_samples: 8757 sql_update_set = [] 8758 for sample in self.get_header_sample_list() + ["FORMAT"]: 8759 if sample in ped_samples: 8760 value = f'dataframe_barcode."{barcode_infos}"' 8761 value_samples = "'" + ",".join(ped_samples) + "'" 8762 elif sample == "FORMAT": 8763 value = f"'{tag}'" 8764 value_samples = f"'{tag}S'" 8765 else: 8766 value = "'.'" 8767 value_samples = "'.'" 8768 format_regex = r"[a-zA-Z0-9\s]" 8769 sql_update_set.append( 8770 f""" 8771 "{sample}" = 8772 concat( 8773 CASE 8774 WHEN {table_variants}."{sample}" = './.' 8775 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 8776 ELSE {table_variants}."{sample}" 8777 END, 8778 ':', 8779 {value}, 8780 ':', 8781 {value_samples} 8782 ) 8783 """ 8784 ) 8785 8786 sql_update_set_join = ", ".join(sql_update_set) 8787 sql_update = f""" 8788 UPDATE {table_variants} 8789 SET {sql_update_set_join} 8790 FROM dataframe_barcode 8791 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8792 """ 8793 self.conn.execute(sql_update) 8794 8795 # Remove added columns 8796 for added_column in added_columns: 8797 self.drop_column(column=added_column) 8798 8799 # Delete dataframe 8800 del dataframe_barcode 8801 gc.collect() 8802 8803 def calculation_trio(self) -> None: 8804 """ 8805 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 8806 information to the INFO field of each variant. 
8807 """ 8808 8809 # if FORMAT and samples 8810 if ( 8811 "FORMAT" in self.get_header_columns_as_list() 8812 and self.get_header_sample_list() 8813 ): 8814 8815 # trio annotation field 8816 trio_tag = "trio" 8817 8818 # VCF infos tags 8819 vcf_infos_tags = { 8820 "trio": "trio calculation", 8821 } 8822 8823 # Param 8824 param = self.get_param() 8825 8826 # Prefix 8827 prefix = self.get_explode_infos_prefix() 8828 8829 # Trio param 8830 trio_ped = ( 8831 param.get("calculation", {}) 8832 .get("calculations", {}) 8833 .get("TRIO", {}) 8834 .get("trio_pedigree", None) 8835 ) 8836 8837 # Load trio 8838 if trio_ped: 8839 8840 # Trio pedigree is a file 8841 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 8842 log.debug("TRIO pedigree is file") 8843 with open(full_path(trio_ped)) as trio_ped: 8844 trio_ped = json.load(trio_ped) 8845 8846 # Trio pedigree is a string 8847 elif isinstance(trio_ped, str): 8848 log.debug("TRIO pedigree is str") 8849 try: 8850 trio_ped = json.loads(trio_ped) 8851 log.debug("TRIO pedigree is json str") 8852 except ValueError as e: 8853 trio_samples = trio_ped.split(",") 8854 if len(trio_samples) == 3: 8855 trio_ped = { 8856 "father": trio_samples[0], 8857 "mother": trio_samples[1], 8858 "child": trio_samples[2], 8859 } 8860 log.debug("TRIO pedigree is list str") 8861 else: 8862 msg_error = "TRIO pedigree not well formatted" 8863 log.error(msg_error) 8864 raise ValueError(msg_error) 8865 8866 # Trio pedigree is a dict 8867 elif isinstance(trio_ped, dict): 8868 log.debug("TRIO pedigree is dict") 8869 8870 # Trio pedigree is not well formatted 8871 else: 8872 msg_error = "TRIO pedigree not well formatted" 8873 log.error(msg_error) 8874 raise ValueError(msg_error) 8875 8876 # Construct trio list 8877 trio_samples = [ 8878 trio_ped.get("father", ""), 8879 trio_ped.get("mother", ""), 8880 trio_ped.get("child", ""), 8881 ] 8882 8883 else: 8884 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 8885 samples_list = self.get_header_sample_list() 8886 if len(samples_list) >= 3: 8887 trio_samples = self.get_header_sample_list()[0:3] 8888 trio_ped = { 8889 "father": trio_samples[0], 8890 "mother": trio_samples[1], 8891 "child": trio_samples[2], 8892 } 8893 else: 8894 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 8895 log.error(msg_error) 8896 raise ValueError(msg_error) 8897 8898 # Check trio pedigree 8899 if not trio_ped or len(trio_ped) != 3: 8900 msg_error = f"Error in TRIO pedigree: {trio_ped}" 8901 log.error(msg_error) 8902 raise ValueError(msg_error) 8903 8904 # Log 8905 log.info( 8906 f"Calculation 'TRIO' - Samples: " 8907 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 8908 ) 8909 8910 # Field 8911 trio_infos = prefix + trio_tag 8912 8913 # Variants table 8914 table_variants = self.get_table_variants() 8915 8916 # Header 8917 vcf_reader = self.get_header() 8918 8919 # Create variant id 8920 variant_id_column = self.get_variant_id_column() 8921 added_columns = [variant_id_column] 8922 8923 # variant_id, FORMAT and samples 8924 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8925 self.get_header_sample_list() 8926 ) 8927 8928 # Create dataframe 8929 dataframe_trio = self.get_query_to_df( 8930 f""" SELECT {samples_fields} FROM {table_variants} """ 8931 ) 8932 8933 # Create trio column 8934 dataframe_trio[trio_infos] = dataframe_trio.apply( 8935 lambda row: trio(row, samples=trio_samples), axis=1 8936 ) 8937 8938 # Add trio to header 8939 vcf_reader.infos[trio_tag] = vcf.parser._Info( 8940 trio_tag, 8941 ".", 8942 "String", 8943 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 8944 "howard calculation", 8945 "0", 8946 self.code_type_map.get("String"), 8947 ) 8948 8949 # Update 8950 sql_update = f""" 8951 UPDATE {table_variants} 8952 SET "INFO" = 8953 concat( 8954 CASE 8955 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8956 THEN '' 8957 ELSE 
concat("INFO", ';') 8958 END, 8959 CASE 8960 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 8961 AND dataframe_trio."{trio_infos}" NOT NULL 8962 THEN concat( 8963 '{trio_tag}=', 8964 dataframe_trio."{trio_infos}" 8965 ) 8966 ELSE '' 8967 END 8968 ) 8969 FROM dataframe_trio 8970 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 8971 """ 8972 self.conn.execute(sql_update) 8973 8974 # Remove added columns 8975 for added_column in added_columns: 8976 self.drop_column(column=added_column) 8977 8978 # Delete dataframe 8979 del dataframe_trio 8980 gc.collect() 8981 8982 def calculation_vaf_normalization(self) -> None: 8983 """ 8984 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 8985 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 8986 :return: The function does not return anything. 8987 """ 8988 8989 # if FORMAT and samples 8990 if ( 8991 "FORMAT" in self.get_header_columns_as_list() 8992 and self.get_header_sample_list() 8993 ): 8994 8995 # vaf_normalization annotation field 8996 vaf_normalization_tag = "VAF" 8997 8998 # VCF infos tags 8999 vcf_infos_tags = { 9000 "VAF": "VAF Variant Frequency", 9001 } 9002 9003 # Prefix 9004 prefix = self.get_explode_infos_prefix() 9005 9006 # Variants table 9007 table_variants = self.get_table_variants() 9008 9009 # Header 9010 vcf_reader = self.get_header() 9011 9012 # Do not calculate if VAF already exists 9013 if "VAF" in vcf_reader.formats: 9014 log.debug("VAF already on genotypes") 9015 return 9016 9017 # Create variant id 9018 variant_id_column = self.get_variant_id_column() 9019 added_columns = [variant_id_column] 9020 9021 # variant_id, FORMAT and samples 9022 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9023 f""" "{sample}" """ for sample in self.get_header_sample_list() 9024 ) 9025 9026 # Create dataframe 9027 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics (nb, list, min, max, mean, mediane, stdev).

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one INFO tag per computed statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: genotype_stats returns a dict of stat name -> value
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of SQL fragments, one per stat, concatenated into INFO
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this stat into its own dataframe column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add stat tag to header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: ';' before every stat after the first one.
                # NOTE(review): the separator depends on position, not on whether
                # the previous stat produced output — if an earlier stat is NULL
                # the INFO string may contain ';;' or start with ';'; confirm
                # whether downstream parsing tolerates this.
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                                '{sep}{stat}=',
                                dataframe_vaf_stats."{stat}"
                            )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
lambda row: genotype_stats( 9143 row, samples=self.get_header_sample_list(), info=info 9144 ), 9145 axis=1, 9146 ) 9147 9148 # List of vcf tags 9149 sql_vaf_stats_fields = [] 9150 9151 # Check all VAF stats infos 9152 for stat in vcf_infos_tags: 9153 9154 # Extract stats 9155 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9156 lambda x: dict(x).get(stat, "") 9157 ) 9158 9159 # Add snpeff_hgvs to header 9160 vcf_reader.infos[stat] = vcf.parser._Info( 9161 stat, 9162 ".", 9163 "String", 9164 vcf_infos_tags.get(stat, "genotype statistics"), 9165 "howard calculation", 9166 "0", 9167 self.code_type_map.get("String"), 9168 ) 9169 9170 if len(sql_vaf_stats_fields): 9171 sep = ";" 9172 else: 9173 sep = "" 9174 9175 # Create fields to add in INFO 9176 sql_vaf_stats_fields.append( 9177 f""" 9178 CASE 9179 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9180 THEN concat( 9181 '{sep}{stat}=', 9182 dataframe_vaf_stats."{stat}" 9183 ) 9184 ELSE '' 9185 END 9186 """ 9187 ) 9188 9189 # SQL set for update 9190 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9191 9192 # Update 9193 sql_update = f""" 9194 UPDATE {table_variants} 9195 SET "INFO" = 9196 concat( 9197 CASE 9198 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9199 THEN '' 9200 ELSE concat("INFO", ';') 9201 END, 9202 {sql_vaf_stats_fields_set} 9203 ) 9204 FROM dataframe_vaf_stats 9205 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 9206 9207 """ 9208 self.conn.execute(sql_update) 9209 9210 # Remove added columns 9211 for added_column in added_columns: 9212 self.drop_column(column=added_column) 9213 9214 # Delete dataframe 9215 del dataframe_vaf_stats 9216 gc.collect() 9217 9218 def calculation_transcripts_json(self, info: str = "transcripts_json") -> None: 9219 """ 9220 The function `calculation_transcripts_json` creates a transcripts table and adds an info field 9221 to it if transcripts are available. 
9222 9223 :param info: The `info` parameter in the `calculation_transcripts_json` method is a string 9224 parameter that specifies the information field to be used in the transcripts JSON. It has a 9225 default value of "transcripts_json" if no value is provided when calling the method, defaults to 9226 transcripts_json 9227 :type info: str (optional) 9228 """ 9229 9230 # Create transcripts table 9231 transcripts_table = self.create_transcript_view() 9232 9233 # Add info field 9234 if transcripts_table: 9235 self.transcript_view_to_variants( 9236 transcripts_table=transcripts_table, transcripts_info_field=info 9237 ) 9238 else: 9239 log.info("No Transcripts to process. Check param.json file configuration") 9240 9241 ############### 9242 # Transcripts # 9243 ############### 9244 9245 def create_transcript_view_from_columns_map( 9246 self, 9247 transcripts_table: str = "transcripts", 9248 columns_maps: dict = {}, 9249 added_columns: list = [], 9250 temporary_tables: list = None, 9251 annotation_fields: list = None, 9252 ) -> tuple[list, list, list]: 9253 """ 9254 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 9255 specified columns mapping for transcripts data. 9256 9257 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9258 the table where the transcripts data is stored or will be stored in the database. This table 9259 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9260 predictions, etc. It defaults to "transcripts, defaults to transcripts 9261 :type transcripts_table: str (optional) 9262 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9263 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9264 represents a mapping configuration for a specific set of columns. 
It typically includes details such 9265 as the main transcript column and additional information columns 9266 :type columns_maps: dict 9267 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9268 function is a list that stores the additional columns that will be added to the view being created 9269 based on the columns map provided. These columns are generated by exploding the transcript 9270 information columns along with the main transcript column 9271 :type added_columns: list 9272 :param temporary_tables: The `temporary_tables` parameter in the 9273 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9274 tables created during the process of creating a transcript view from a columns map. These temporary 9275 tables are used to store intermediate results or transformations before the final view is generated 9276 :type temporary_tables: list 9277 :param annotation_fields: The `annotation_fields` parameter in the 9278 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9279 for annotation in the query view creation process. These fields are extracted from the 9280 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9281 :type annotation_fields: list 9282 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9283 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 
9284 """ 9285 9286 log.debug("Start transcrpts view creation from columns map...") 9287 9288 # "from_columns_map": [ 9289 # { 9290 # "transcripts_column": "Ensembl_transcriptid", 9291 # "transcripts_infos_columns": [ 9292 # "genename", 9293 # "Ensembl_geneid", 9294 # "LIST_S2_score", 9295 # "LIST_S2_pred", 9296 # ], 9297 # }, 9298 # { 9299 # "transcripts_column": "Ensembl_transcriptid", 9300 # "transcripts_infos_columns": [ 9301 # "genename", 9302 # "VARITY_R_score", 9303 # "Aloft_pred", 9304 # ], 9305 # }, 9306 # ], 9307 9308 # Init 9309 if temporary_tables is None: 9310 temporary_tables = [] 9311 if annotation_fields is None: 9312 annotation_fields = [] 9313 9314 # Variants table 9315 table_variants = self.get_table_variants() 9316 9317 for columns_map in columns_maps: 9318 9319 # Transcript column 9320 transcripts_column = columns_map.get("transcripts_column", None) 9321 9322 # Transcripts infos columns 9323 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9324 9325 if transcripts_column is not None: 9326 9327 # Explode 9328 added_columns += self.explode_infos( 9329 fields=[transcripts_column] + transcripts_infos_columns 9330 ) 9331 9332 # View clauses 9333 clause_select = [] 9334 for field in [transcripts_column] + transcripts_infos_columns: 9335 clause_select.append( 9336 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9337 ) 9338 if field not in [transcripts_column]: 9339 annotation_fields.append(field) 9340 9341 # Querey View 9342 query = f""" 9343 SELECT 9344 "#CHROM", POS, REF, ALT, 9345 "{transcripts_column}" AS 'transcript', 9346 {", ".join(clause_select)} 9347 FROM ( 9348 SELECT 9349 "#CHROM", POS, REF, ALT, 9350 {", ".join(clause_select)} 9351 FROM {table_variants} 9352 ) 9353 WHERE "{transcripts_column}" IS NOT NULL 9354 """ 9355 9356 # Create temporary table 9357 temporary_table = transcripts_table + "".join( 9358 random.choices(string.ascii_uppercase + string.digits, k=10) 9359 ) 9360 9361 # Temporary_tables 
9362 temporary_tables.append(temporary_table) 9363 query_view = f""" 9364 CREATE TEMPORARY TABLE {temporary_table} 9365 AS ({query}) 9366 """ 9367 self.execute_query(query=query_view) 9368 9369 return added_columns, temporary_tables, annotation_fields 9370 9371 def create_transcript_view_from_column_format( 9372 self, 9373 transcripts_table: str = "transcripts", 9374 column_formats: dict = {}, 9375 temporary_tables: list = None, 9376 annotation_fields: list = None, 9377 ) -> tuple[list, list, list]: 9378 """ 9379 The `create_transcript_view_from_column_format` function generates a transcript view based on 9380 specified column formats, adds additional columns and annotation fields, and returns the list of 9381 temporary tables and annotation fields. 9382 9383 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9384 the table containing the transcripts data. This table will be used as the base table for creating 9385 the transcript view. The default value for this parameter is "transcripts", but you can provide a 9386 different table name if needed, defaults to transcripts 9387 :type transcripts_table: str (optional) 9388 :param column_formats: The `column_formats` parameter is a dictionary that contains information 9389 about the columns to be used for creating the transcript view. Each entry in the dictionary 9390 specifies the mapping between a transcripts column and a transcripts infos column. For example, in 9391 the provided code snippet: 9392 :type column_formats: dict 9393 :param temporary_tables: The `temporary_tables` parameter in the 9394 `create_transcript_view_from_column_format` function is a list that stores the names of temporary 9395 views created during the process of creating a transcript view from a column format. These temporary 9396 views are used to manipulate and extract data before generating the final transcript view. 
It 9397 :type temporary_tables: list 9398 :param annotation_fields: The `annotation_fields` parameter in the 9399 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 9400 that are extracted from the temporary views created during the process. These annotation fields are 9401 obtained by querying the temporary views and extracting the column names excluding specific columns 9402 like `#CH 9403 :type annotation_fields: list 9404 :return: The `create_transcript_view_from_column_format` function returns two lists: 9405 `temporary_tables` and `annotation_fields`. 9406 """ 9407 9408 log.debug("Start transcrpts view creation from column format...") 9409 9410 # "from_column_format": [ 9411 # { 9412 # "transcripts_column": "ANN", 9413 # "transcripts_infos_column": "Feature_ID", 9414 # } 9415 # ], 9416 9417 # Init 9418 if temporary_tables is None: 9419 temporary_tables = [] 9420 if annotation_fields is None: 9421 annotation_fields = [] 9422 9423 for column_format in column_formats: 9424 9425 # annotation field and transcript annotation field 9426 annotation_field = column_format.get("transcripts_column", "ANN") 9427 transcript_annotation = column_format.get( 9428 "transcripts_infos_column", "Feature_ID" 9429 ) 9430 9431 # Temporary View name 9432 temporary_view_name = transcripts_table + "".join( 9433 random.choices(string.ascii_uppercase + string.digits, k=10) 9434 ) 9435 9436 # Create temporary view name 9437 temporary_view_name = self.annotation_format_to_table( 9438 uniquify=True, 9439 annotation_field=annotation_field, 9440 view_name=temporary_view_name, 9441 annotation_id=transcript_annotation, 9442 ) 9443 9444 # Annotation fields 9445 if temporary_view_name: 9446 query_annotation_fields = f""" 9447 SELECT * 9448 FROM ( 9449 DESCRIBE SELECT * 9450 FROM {temporary_view_name} 9451 ) 9452 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9453 """ 9454 df_annotation_fields = self.get_query_to_df( 9455 
query=query_annotation_fields 9456 ) 9457 9458 # Add temporary view and annotation fields 9459 temporary_tables.append(temporary_view_name) 9460 annotation_fields += list(set(df_annotation_fields["column_name"])) 9461 9462 return temporary_tables, annotation_fields 9463 9464 def create_transcript_view( 9465 self, 9466 transcripts_table: str = None, 9467 transcripts_table_drop: bool = True, 9468 param: dict = {}, 9469 ) -> str: 9470 """ 9471 The `create_transcript_view` function generates a transcript view by processing data from a 9472 specified table based on provided parameters and structural information. 9473 9474 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9475 is used to specify the name of the table that will store the final transcript view data. If a table 9476 name is not provided, the function will create a new table to store the transcript view data, and by 9477 default,, defaults to transcripts 9478 :type transcripts_table: str (optional) 9479 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9480 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9481 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9482 the function will drop the existing transcripts table if it exists, defaults to True 9483 :type transcripts_table_drop: bool (optional) 9484 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9485 contains information needed to create a transcript view. It includes details such as the structure 9486 of the transcripts, columns mapping, column formats, and other necessary information for generating 9487 the view. 
This parameter allows for flexibility and customization 9488 :type param: dict 9489 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9490 created or modified during the execution of the function. 9491 """ 9492 9493 log.debug("Start transcrpts view creation...") 9494 9495 # Default 9496 transcripts_table_default = "transcripts" 9497 9498 # Param 9499 if not param: 9500 param = self.get_param() 9501 9502 # Struct 9503 struct = param.get("transcripts", {}).get("struct", None) 9504 9505 if struct: 9506 9507 # Transcripts table 9508 if transcripts_table is None: 9509 transcripts_table = param.get("transcripts", {}).get( 9510 "table", transcripts_table_default 9511 ) 9512 9513 # added_columns 9514 added_columns = [] 9515 9516 # Temporary tables 9517 temporary_tables = [] 9518 9519 # Annotation fields 9520 annotation_fields = [] 9521 9522 # from columns map 9523 columns_maps = struct.get("from_columns_map", []) 9524 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 9525 self.create_transcript_view_from_columns_map( 9526 transcripts_table=transcripts_table, 9527 columns_maps=columns_maps, 9528 added_columns=added_columns, 9529 temporary_tables=temporary_tables, 9530 annotation_fields=annotation_fields, 9531 ) 9532 ) 9533 added_columns += added_columns_tmp 9534 temporary_tables += temporary_tables_tmp 9535 annotation_fields += annotation_fields_tmp 9536 9537 # from column format 9538 column_formats = struct.get("from_column_format", []) 9539 temporary_tables_tmp, annotation_fields_tmp = ( 9540 self.create_transcript_view_from_column_format( 9541 transcripts_table=transcripts_table, 9542 column_formats=column_formats, 9543 temporary_tables=temporary_tables, 9544 annotation_fields=annotation_fields, 9545 ) 9546 ) 9547 temporary_tables += temporary_tables_tmp 9548 annotation_fields += annotation_fields_tmp 9549 9550 # Merge temporary tables query 9551 query_merge = "" 9552 for temporary_table in 
temporary_tables: 9553 9554 # First temporary table 9555 if not query_merge: 9556 query_merge = f""" 9557 SELECT * FROM {temporary_table} 9558 """ 9559 # other temporary table (using UNION) 9560 else: 9561 query_merge += f""" 9562 UNION BY NAME SELECT * FROM {temporary_table} 9563 """ 9564 9565 # Merge on transcript 9566 query_merge_on_transcripts_annotation_fields = [] 9567 # Aggregate all annotations fields 9568 for annotation_field in set(annotation_fields): 9569 query_merge_on_transcripts_annotation_fields.append( 9570 f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """ 9571 ) 9572 # Query for transcripts view 9573 query_merge_on_transcripts = f""" 9574 SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)} 9575 FROM ({query_merge}) 9576 GROUP BY "#CHROM", POS, REF, ALT, transcript 9577 """ 9578 9579 # Drop transcript view is necessary 9580 if transcripts_table_drop: 9581 query_drop = f""" 9582 DROP TABLE IF EXISTS {transcripts_table}; 9583 """ 9584 self.execute_query(query=query_drop) 9585 9586 # Merge and create transcript view 9587 query_create_view = f""" 9588 CREATE TABLE IF NOT EXISTS {transcripts_table} 9589 AS {query_merge_on_transcripts} 9590 """ 9591 self.execute_query(query=query_create_view) 9592 9593 # Remove added columns 9594 for added_column in added_columns: 9595 self.drop_column(column=added_column) 9596 9597 else: 9598 9599 transcripts_table = None 9600 9601 return transcripts_table 9602 9603 def annotation_format_to_table( 9604 self, 9605 uniquify: bool = True, 9606 annotation_field: str = "ANN", 9607 annotation_id: str = "Feature_ID", 9608 view_name: str = "transcripts", 9609 ) -> str: 9610 """ 9611 The function `annotation_format_to_table` converts annotation data from a VCF file into a structured 9612 table format. 
9613 9614 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique 9615 values in the output or not. If set to `True`, the function will make sure that the output values 9616 are unique, defaults to True 9617 :type uniquify: bool (optional) 9618 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that 9619 contains the annotation information for each variant. This field is used to extract the annotation 9620 details for further processing in the function, defaults to ANN 9621 :type annotation_field: str (optional) 9622 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is 9623 used to specify the identifier for the annotation feature. This identifier will be used as a column 9624 name in the resulting table or view that is created based on the annotation data. It helps in 9625 uniquely identifying each annotation entry in the, defaults to Feature_ID 9626 :type annotation_id: str (optional) 9627 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to 9628 specify the name of the temporary table that will be created to store the transformed annotation 9629 data. This table will hold the extracted information from the annotation field in a structured 9630 format for further processing or analysis, defaults to transcripts 9631 :type view_name: str (optional) 9632 :return: The function `annotation_format_to_table` is returning the name of the view created, which 9633 is stored in the variable `view_name`. 
9634 """ 9635 9636 # Annotation field 9637 annotation_format = "annotation_explode" 9638 9639 # Transcript annotation 9640 annotation_id = "".join(char for char in annotation_id if char.isalnum()) 9641 9642 # Prefix 9643 prefix = self.get_explode_infos_prefix() 9644 if prefix: 9645 prefix = "INFO/" 9646 9647 # Annotation fields 9648 annotation_infos = prefix + annotation_field 9649 annotation_format_infos = prefix + annotation_format 9650 9651 # Variants table 9652 table_variants = self.get_table_variants() 9653 9654 # Header 9655 vcf_reader = self.get_header() 9656 9657 # Add columns 9658 added_columns = [] 9659 9660 # Explode HGVS field in column 9661 added_columns += self.explode_infos(fields=[annotation_field]) 9662 9663 if annotation_field in vcf_reader.infos: 9664 9665 # Extract ANN header 9666 ann_description = vcf_reader.infos[annotation_field].desc 9667 pattern = r"'(.+?)'" 9668 match = re.search(pattern, ann_description) 9669 if match: 9670 ann_header_match = match.group(1).split(" | ") 9671 ann_header = [] 9672 ann_header_desc = {} 9673 for i in range(len(ann_header_match)): 9674 ann_header_info = "".join( 9675 char for char in ann_header_match[i] if char.isalnum() 9676 ) 9677 ann_header.append(ann_header_info) 9678 ann_header_desc[ann_header_info] = ann_header_match[i] 9679 if not ann_header_desc: 9680 raise ValueError("Invalid header description format") 9681 else: 9682 raise ValueError("Invalid header description format") 9683 9684 # Create variant id 9685 variant_id_column = self.get_variant_id_column() 9686 added_columns += [variant_id_column] 9687 9688 # Create dataframe 9689 dataframe_annotation_format = self.get_query_to_df( 9690 f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """ 9691 ) 9692 9693 # Create annotation columns 9694 dataframe_annotation_format[ 9695 annotation_format_infos 9696 ] = dataframe_annotation_format[annotation_infos].apply( 9697 lambda x: explode_annotation_format( 
9698 annotation=str(x), 9699 uniquify=uniquify, 9700 output_format="JSON", 9701 prefix="", 9702 header=list(ann_header_desc.values()), 9703 ) 9704 ) 9705 9706 # Find keys 9707 query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;""" 9708 df_keys = self.get_query_to_df(query=query_json) 9709 9710 # Check keys 9711 query_json_key = [] 9712 for _, row in df_keys.iterrows(): 9713 9714 # Key 9715 key = row.iloc[0] 9716 9717 # key_clean 9718 key_clean = "".join(char for char in key if char.isalnum()) 9719 9720 # Type 9721 query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');""" 9722 9723 # Get DataFrame from query 9724 df_json_type = self.get_query_to_df(query=query_json_type) 9725 9726 # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN 9727 with pd.option_context("future.no_silent_downcasting", True): 9728 df_json_type.fillna(value="", inplace=True) 9729 replace_dict = {None: np.nan, "": np.nan} 9730 df_json_type.replace(replace_dict, inplace=True) 9731 df_json_type.dropna(inplace=True) 9732 9733 # Detect column type 9734 column_type = detect_column_type(df_json_type[key_clean]) 9735 9736 # Append 9737 query_json_key.append( 9738 f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """ 9739 ) 9740 9741 # Create view 9742 query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));""" 9743 self.execute_query(query=query_view) 9744 9745 else: 9746 9747 # Return None 9748 view_name = None 9749 9750 # Remove added columns 9751 for added_column in added_columns: 9752 self.drop_column(column=added_column) 9753 9754 return 
view_name 9755 9756 def transcript_view_to_variants( 9757 self, 9758 transcripts_table: str = None, 9759 transcripts_column_id: str = None, 9760 transcripts_info_json: str = None, 9761 transcripts_info_field: str = None, 9762 param: dict = {}, 9763 ) -> bool: 9764 """ 9765 The function `transcript_view_to_variants` takes input parameters related to transcripts and updates 9766 a variants table with information from the transcripts in JSON format. 9767 9768 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the table 9769 containing the transcripts data. If this parameter is not provided, the function will attempt to 9770 retrieve it from the `param` dictionary or use a default value of "transcripts" 9771 :type transcripts_table: str 9772 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in 9773 the `transcripts_table` that contains the unique identifier for each transcript. This identifier is 9774 used to match transcripts with variants in the database 9775 :type transcripts_column_id: str 9776 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of 9777 the column in the variants table where the transcripts information will be stored in JSON format 9778 :type transcripts_info_json: str 9779 :param transcripts_info_field: The `transcripts_info_field` parameter is used to specify the field 9780 in the VCF header that will contain information about transcripts in JSON format. This field will be 9781 added to the VCF header as an INFO field with the specified name 9782 :type transcripts_info_field: str 9783 :param param: The `transcript_view_to_variants` method takes several parameters: 9784 :type param: dict 9785 :return: The function `transcript_view_to_variants` returns a boolean value, which is `True` if the 9786 operation is successful and `False` if certain conditions are not met. 
9787 """ 9788 9789 log.debug("Start transcripts view to JSON...") 9790 9791 # Default 9792 transcripts_table_default = "transcripts" 9793 transcripts_column_id_default = "transcript" 9794 transcripts_info_json_default = None 9795 transcripts_info_field_default = None 9796 9797 # Param 9798 if not param: 9799 param = self.get_param() 9800 9801 # Transcripts table 9802 if transcripts_table is None: 9803 transcripts_table = param.get("transcripts", {}).get( 9804 "table", transcripts_table_default 9805 ) 9806 9807 # Transcripts column ID 9808 if transcripts_column_id is None: 9809 transcripts_column_id = param.get("transcripts", {}).get( 9810 "column_id", transcripts_column_id_default 9811 ) 9812 9813 # Transcripts info field 9814 if transcripts_info_json is None: 9815 transcripts_info_json = param.get("transcripts", {}).get( 9816 "transcripts_info_json", transcripts_info_json_default 9817 ) 9818 9819 # Transcripts info field 9820 if transcripts_info_field is None: 9821 transcripts_info_field = param.get("transcripts", {}).get( 9822 "transcripts_info_field", transcripts_info_field_default 9823 ) 9824 9825 # Variants table 9826 table_variants = self.get_table_variants() 9827 9828 # Check info columns param 9829 if transcripts_info_json is None and transcripts_info_field is None: 9830 return False 9831 9832 # Transcripts infos columns 9833 query_transcripts_infos_columns = f""" 9834 SELECT * 9835 FROM ( 9836 DESCRIBE SELECT * FROM {transcripts_table} 9837 ) 9838 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 9839 """ 9840 transcripts_infos_columns = list( 9841 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 9842 ) 9843 9844 # View results 9845 clause_select = [] 9846 clause_to_json = [] 9847 for field in transcripts_infos_columns: 9848 clause_select.append( 9849 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9850 ) 9851 clause_to_json.append(f""" '{field}': "{field}" """) 9852 9853 # Update 
9854 update_set = [] 9855 9856 # VCF header 9857 vcf_reader = self.get_header() 9858 9859 # Transcripts to info column in JSON 9860 if transcripts_info_json is not None: 9861 9862 # Create column on variants table 9863 self.add_column( 9864 table_name=table_variants, 9865 column_name=transcripts_info_json, 9866 column_type="JSON", 9867 default_value=None, 9868 drop=False, 9869 ) 9870 9871 # Add to update 9872 update_set.append( 9873 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 9874 ) 9875 9876 # Add header 9877 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 9878 transcripts_info_json, 9879 ".", 9880 "String", 9881 "Transcripts in JSON format", 9882 "unknwon", 9883 "unknwon", 9884 self.code_type_map["String"], 9885 ) 9886 9887 # Transcripts to info field in JSON 9888 if transcripts_info_field is not None: 9889 9890 # Add to update 9891 update_set.append( 9892 f""" 9893 INFO = concat( 9894 CASE 9895 WHEN INFO NOT IN ('', '.') 9896 THEN INFO 9897 ELSE '' 9898 END, 9899 CASE 9900 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 9901 THEN concat( 9902 ';{transcripts_info_field}=', 9903 t.{transcripts_info_json} 9904 ) 9905 ELSE '' 9906 END 9907 ) 9908 """ 9909 ) 9910 9911 # Add header 9912 vcf_reader.infos[transcripts_info_field] = vcf.parser._Info( 9913 transcripts_info_field, 9914 ".", 9915 "String", 9916 "Transcripts in JSON format", 9917 "unknwon", 9918 "unknwon", 9919 self.code_type_map["String"], 9920 ) 9921 9922 # Update query 9923 query_update = f""" 9924 UPDATE {table_variants} 9925 SET {", ".join(update_set)} 9926 FROM 9927 ( 9928 SELECT 9929 "#CHROM", POS, REF, ALT, 9930 concat( 9931 '{{', 9932 string_agg( 9933 '"' || "{transcripts_column_id}" || '":' || 9934 to_json(json_output) 9935 ), 9936 '}}' 9937 )::JSON AS {transcripts_info_json} 9938 FROM 9939 ( 9940 SELECT 9941 "#CHROM", POS, REF, ALT, 9942 "{transcripts_column_id}", 9943 to_json( 9944 {{{",".join(clause_to_json)}}} 9945 )::JSON AS json_output 9946 FROM 9947 
(SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 9948 WHERE "{transcripts_column_id}" IS NOT NULL 9949 ) 9950 GROUP BY "#CHROM", POS, REF, ALT 9951 ) AS t 9952 WHERE {table_variants}."#CHROM" = t."#CHROM" 9953 AND {table_variants}."POS" = t."POS" 9954 AND {table_variants}."REF" = t."REF" 9955 AND {table_variants}."ALT" = t."ALT" 9956 """ 9957 9958 self.execute_query(query=query_update) 9959 9960 return True
def __init__(
    self,
    conn=None,
    input: str = None,
    output: str = None,
    config: dict = None,
    param: dict = None,
    load: bool = False,
) -> None:
    """
    Initialize the object: set the input, output, config, param, connexion
    and header, and optionally load the data.

    :param conn: the connection to the database (created by `set_connexion` when None)
    :param input: the input file
    :param output: the output file
    :param config: a dictionary containing the configuration of the model
    :param param: a dictionary containing the parameters of the model
    :param load: if True, load the input data after initialization
    """

    # Avoid mutable default arguments: a shared `{}` default would leak
    # state between instances that mutate their config/param in place.
    if config is None:
        config = {}
    if param is None:
        param = {}

    # Init variables
    self.init_variables()

    # Input
    self.set_input(input)

    # Config
    self.set_config(config)

    # Param
    self.set_param(param)

    # Output
    self.set_output(output)

    # connexion
    self.set_connexion(conn)

    # Header
    self.set_header()

    # Load data
    if load:
        self.load_data()
The function __init__ initializes the variables and sets the input, output, config, param, connexion and header.
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
def set_input(self, input: str = None) -> None:
    """
    Set the input file and derive its name, extension and format.

    Accepts either a path as a string or a file-like object exposing a
    `.name` attribute. Sets `self.input` and, when an input is provided,
    `self.input_name`, `self.input_extension` and `self.input_format`.

    :param input: input file path (str) or file-like object with a `.name`
    :type input: str
    :raises ValueError: if `input` is neither a string nor an object with a `.name`
    """

    if input and not isinstance(input, str):
        # File-like objects are accepted through their `.name` attribute
        try:
            self.input = input.name
        except AttributeError:
            log.error(f"Input file '{input}' in bad format")
            raise ValueError(f"Input file '{input}' in bad format")
    else:
        self.input = input

    # Input format, derived from the file extension
    if input:
        input_name, input_extension = os.path.splitext(self.input)
        self.input_name = input_name
        self.input_extension = input_extension
        self.input_format = self.input_extension.replace(".", "")
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: The `set_input` method is used to set attributes related to the input file: the input path, its base name, its extension and its format.
def set_config(self, config: dict) -> None:
    """
    Attach the given dictionary as the configuration object of this
    instance.

    :param config: dictionary of configuration settings for the class
    :type config: dict
    """

    self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: The `config` parameter in the `set_config` function is a dictionary object that contains configuration settings for the class. When you call the `set_config` function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class.
def set_param(self, param: dict) -> None:
    """
    Attach the given dictionary as the parameter object of this instance.

    :param param: dictionary of parameters for the class instance
    :type param: dict
    """

    self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: The `set_param` method takes a dictionary object as input and sets it as the `param` attribute of the class instance.
def init_variables(self) -> None:
    """
    Initialize the instance attributes used throughout the class:
    naming prefix, variants table name, dataframe holder, comparison
    operator map, and VCF-type/SQL-type mappings.
    """

    # Naming and storage defaults
    self.prefix = "howard"
    self.table_variants = "variants"
    self.dataframe = None

    # Comparison keywords mapped to their SQL operators
    self.comparison_map = {
        "gt": ">",
        "gte": ">=",
        "lt": "<",
        "lte": "<=",
        "equals": "=",
        "contains": "SIMILAR TO",
    }

    # VCF header value types mapped to internal type codes
    self.code_type_map = {
        "Integer": 0,
        "String": 1,
        "Float": 2,
        "Flag": 3,
    }

    # VCF header value types mapped to SQL column types
    self.code_type_map_to_sql = {
        "Integer": "INTEGER",
        "String": "VARCHAR",
        "Float": "FLOAT",
        "Flag": "VARCHAR",
    }

    # Additional fields to index
    self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
def get_indexing(self) -> bool:
    """
    Return the "indexing" flag from the parameters, defaulting to False
    when the key is absent.

    :return: the value of the indexing parameter
    """

    param = self.get_param()
    return param.get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
def get_connexion_config(self) -> dict:
    """
    Build the connexion configuration dictionary from the object
    configuration: number of threads, memory limit, temporary directory
    and access mode.

    :return: a dictionary of connexion settings
    """

    # config
    config = self.get_config()

    # Connexion config
    connexion_config = {}

    # Threads
    threads = self.get_threads()
    if threads:
        connexion_config["threads"] = threads

    # Memory
    if self.get_memory():
        connexion_config["memory_limit"] = self.get_memory()

    # Temporary directory
    if config.get("tmp", None):
        connexion_config["temp_directory"] = config.get("tmp")

    # Access
    if config.get("access", None):
        access = config.get("access")
        if access in ["RO"]:
            access = "READ_ONLY"
        elif access in ["RW"]:
            access = "READ_WRITE"
        connexion_db = self.get_connexion_db()
        # An in-memory database cannot be opened read-only.
        # BUGFIX: the original used `connexion_db in ":memory:"`, a
        # substring test that also matched e.g. "mem" or ""; equality
        # with ":memory:" is the intended check.
        if connexion_db == ":memory:":
            access = "READ_WRITE"
        connexion_config["access_mode"] = access

    return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the Connexion library.
def get_duckdb_settings(self) -> dict:
    """
    Retrieve DuckDB settings from the configuration, provided either as
    a settings file (JSON/YAML) or as a JSON string.

    :return: a dictionary of DuckDB settings (empty when none configured)
    """

    # config
    config = self.get_config()

    # duckdb settings
    duckdb_settings_dict = {}
    duckdb_settings = config.get("duckdb_settings", None)
    if duckdb_settings:
        duckdb_settings = full_path(duckdb_settings)
        if os.path.exists(duckdb_settings):
            # duckdb settings provided as a file (JSON or YAML)
            with open(duckdb_settings) as settings_file:
                duckdb_settings_dict = yaml.safe_load(settings_file)
        else:
            # duckdb settings provided as a JSON string
            duckdb_settings_dict = json.loads(duckdb_settings)

    return duckdb_settings_dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
def set_connexion_db(self) -> str:
    """
    Determine and store the database connexion string, depending on the
    input format and the configured connexion type.

    :return: the connexion database string
    """

    # In-memory database by default
    default_connexion_db = ":memory:"

    connexion_type = self.get_connexion_type()

    if self.get_input_format() in ["db", "duckdb"]:
        # The input file itself is the database
        connexion_db = self.get_input()
    elif connexion_type in ["memory", default_connexion_db, None]:
        connexion_db = default_connexion_db
    elif connexion_type in ["tmpfile"]:
        # Database file inside a dedicated temporary directory
        tmp_name = tempfile.mkdtemp(
            prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
        )
        connexion_db = f"{tmp_name}/tmp.db"
    elif connexion_type != "":
        # Explicit database path given as the connexion type
        connexion_db = connexion_type
    else:
        connexion_db = default_connexion_db

    # Set connexion db
    self.connexion_db = connexion_db

    return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
The value of the variable `connexion_db`.
def set_connexion(self, conn) -> None:
    """
    Attach a database connexion to the object, creating one when needed.

    When no connexion is provided, a new one is opened according to the
    configured connexion format ("duckdb" by default, or "sqlite"), using
    the connexion database string and connexion configuration. Any
    configured duckDB settings are applied through PRAGMA statements.

    :param conn: an existing database connexion, or None to create one
    """

    # Connexion db
    connexion_db = self.set_connexion_db()

    # Connexion config
    connexion_config = self.get_connexion_config()

    # Connexion format
    connexion_format = self.get_config().get("connexion_format", "duckdb")
    self.connexion_format = connexion_format

    # Create a connexion when none was provided
    if not conn:
        if connexion_format in ["duckdb"]:
            conn = duckdb.connect(connexion_db, config=connexion_config)
            # Apply duckDB settings through PRAGMA statements
            duckdb_settings = self.get_duckdb_settings()
            if duckdb_settings:
                for setting, setting_value in duckdb_settings.items():
                    if isinstance(setting_value, str):
                        setting_value = f"'{setting_value}'"
                    conn.execute(f"PRAGMA {setting}={setting_value};")
        elif connexion_format in ["sqlite"]:
            conn = sqlite3.connect(connexion_db)

    # Set connexion
    self.conn = conn

    # Log
    log.debug(f"connexion_format: {connexion_format}")
    log.debug(f"connexion_db: {connexion_db}")
    log.debug(f"connexion config: {connexion_config}")
    log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: The `conn` parameter in the `set_connexion` method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
    """
    Set the output file and derive its name, extension and format.

    Accepts either a path as a string or a file-like object exposing a
    `.name` attribute. When no output is provided, the output attributes
    are set to None.

    :param output: output file path (str) or file-like object with a `.name`
    :type output: str
    :raises ValueError: if `output` is neither a string nor an object with a `.name`
    """

    if output and not isinstance(output, str):
        # File-like objects are accepted through their `.name` attribute
        # (same contract and error handling as set_input)
        try:
            self.output = output.name
        except AttributeError:
            log.error(f"Output file '{output}' in bad format")
            raise ValueError(f"Output file '{output}' in bad format")
    else:
        self.output = output

    # Output format, derived from the file extension
    if self.output:
        output_name, output_extension = os.path.splitext(self.output)
        self.output_name = output_name
        self.output_extension = output_extension
        self.output_format = self.output_extension.replace(".", "")
    else:
        self.output_name = None
        self.output_extension = None
        self.output_format = None
The set_output function in Python sets the output file based on the input or a specified key
in the config file, extracting the output name, extension, and format.
Parameters
- output: The `output` parameter in the `set_output` method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to `None`.
def set_header(self) -> None:
    """
    It reads the header of a VCF file and stores it as a list of strings and as a VCF object

    Sets `self.header_list` (header lines as a list of strings) and
    `self.header_vcf` (a `vcf.Reader` built from those lines). When no
    input file is set, both attributes are set to None. The header is
    searched, in order: in a `header_file` given in config, inside the
    VCF input itself, in a sibling `<input>.hdr` file, and finally by
    inferring it from the file columns; on failure a minimal default
    VCF header is used.
    """

    input_file = self.get_input()
    # Minimal fallback header used whenever no real header can be found
    default_header_list = [
        "##fileformat=VCFv4.2",
        "#CHROM POS ID REF ALT QUAL FILTER INFO",
    ]

    # Full path
    input_file = full_path(input_file)

    if input_file:

        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        config = self.get_config()
        header_list = default_header_list
        if input_format in [
            "vcf",
            "hdr",
            "tsv",
            "csv",
            "psv",
            "parquet",
            "db",
            "duckdb",
        ]:
            # header provided in param
            if config.get("header_file", None):
                with open(config.get("header_file"), "rt") as f:
                    header_list = self.read_vcf_header(f)
            # within a vcf file format (header within input file itsself)
            elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                # within a compressed vcf file format (.vcf.gz)
                if input_compressed:
                    with bgzf.open(input_file, "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within an uncompressed vcf file format (.vcf)
                else:
                    with open(input_file, "rt") as f:
                        header_list = self.read_vcf_header(f)
            # header provided in default external file .hdr
            elif os.path.exists((input_file + ".hdr")):
                with open(input_file + ".hdr", "rt") as f:
                    header_list = self.read_vcf_header(f)
            else:
                try:  # Try to get header info fields and file columns

                    with tempfile.TemporaryDirectory() as tmpdir:

                        # Create database
                        db_for_header = Database(database=input_file)

                        # Get header columns for infos fields
                        db_header_from_columns = (
                            db_for_header.get_header_from_columns()
                        )

                        # Get real columns in the file
                        db_header_columns = db_for_header.get_columns()

                        # Write header file
                        header_file_tmp = os.path.join(tmpdir, "header")
                        f = open(header_file_tmp, "w")
                        vcf.Writer(f, db_header_from_columns)
                        f.close()

                        # Replace #CHROM line with rel columns
                        header_list = db_for_header.read_header_file(
                            header_file=header_file_tmp
                        )
                        header_list[-1] = "\t".join(db_header_columns)

                # NOTE(review): bare except deliberately turns any failure
                # into a best-effort fallback to the default header —
                # consider narrowing the exception type.
                except:

                    log.warning(
                        f"No header for file {input_file}. Set as default VCF header"
                    )
                    header_list = default_header_list

        else:  # try for unknown format ?

            log.error(f"Input file format '{input_format}' not available")
            raise ValueError(f"Input file format '{input_format}' not available")

        if not header_list:
            header_list = default_header_list

        # header as list
        self.header_list = header_list

        # header as VCF object
        self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

    else:

        # No input file: leave both header representations unset
        self.header_list = None
        self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
    """
    Execute a SQL query and return the result as a pandas DataFrame,
    using the current connexion (duckdb or sqlite).

    :param query: the SQL query to execute
    :type query: str
    :param limit: maximum number of rows to fetch; when None, the full
        result is returned
    :type limit: int
    :return: the query result as a pandas DataFrame
    """

    # Connexion format
    connexion_format = self.get_connexion_format()

    # Limited query: fetch only the first `limit` rows.
    # BUGFIX: removed `pd.set_option("display.max_rows", limit)` — it
    # mutated global pandas display state as a hidden side effect and
    # had no influence on the rows fetched.
    if limit:
        if connexion_format in ["duckdb"]:
            df = (
                self.conn.execute(query)
                .fetch_record_batch(limit)
                .read_next_batch()
                .to_pandas()
            )
        elif connexion_format in ["sqlite"]:
            # chunksize makes read_sql_query an iterator; take the first chunk
            df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

    # Full query
    else:
        if connexion_format in ["duckdb"]:
            df = self.conn.execute(query).df()
        elif connexion_format in ["sqlite"]:
            df = pd.read_sql_query(query, self.conn)

    return df
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: The `query` parameter in the `get_query_to_df` function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame.
- limit: The `limit` parameter in the `get_query_to_df` function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full result is returned.
Returns
A pandas DataFrame is being returned by the `get_query_to_df` function.
def get_overview(self) -> None:
    """
    Log an overview of the current object: input/output files and their
    formats, config, param, sample list, and a dataframe preview of the
    variants table.
    """

    # Build a dataframe of the variants table for the preview
    table_variants_from = self.get_table_variants(clause="from")
    sql_columns = self.get_header_columns_as_sql()
    sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
    df = self.get_query_to_df(sql_query_export)

    # Input / Output
    log.info(f"Input: {self.get_input()} [{self.get_input_format()}]")
    log.info(f"Output: {self.get_output()} [{self.get_output_format()}]")

    # Config and Param, one indented log line per JSON line
    log.info("Config: ")
    for line in json.dumps(self.get_config(), indent=4, sort_keys=True).split("\n"):
        log.info("\t" + str(line))
    log.info("Param: ")
    for line in json.dumps(self.get_param(), indent=4, sort_keys=True).split("\n"):
        log.info("\t" + str(line))

    # Samples and dataframe preview
    log.info("Sample list: " + str(self.get_header_sample_list()))
    log.info("Dataframe: ")
    for line in str(df).split("\n"):
        log.info("\t" + str(line))

    # garbage collector
    del df
    gc.collect()

    return None
The function prints the input, output, config, and dataframe of the current object
527 def get_stats(self) -> dict: 528 """ 529 The `get_stats` function calculates and returns various statistics of the current object, 530 including information about the input file, variants, samples, header fields, quality, and 531 SNVs/InDels. 532 :return: a dictionary containing various statistics of the current object. The dictionary has 533 the following structure: 534 """ 535 536 # Log 537 log.info(f"Stats Calculation...") 538 539 # table varaints 540 table_variants_from = self.get_table_variants() 541 542 # stats dict 543 stats = {"Infos": {}} 544 545 ### File 546 input_file = self.get_input() 547 stats["Infos"]["Input file"] = input_file 548 549 # Header 550 header_infos = self.get_header().infos 551 header_formats = self.get_header().formats 552 header_infos_list = list(header_infos) 553 header_formats_list = list(header_formats) 554 555 ### Variants 556 557 stats["Variants"] = {} 558 559 # Variants by chr 560 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 561 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 562 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 563 by=["CHROM"], kind="quicksort" 564 ) 565 566 # Total number of variants 567 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 568 569 # Calculate percentage 570 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 571 lambda x: (x / nb_of_variants) 572 ) 573 574 stats["Variants"]["Number of variants by chromosome"] = ( 575 nb_of_variants_by_chrom.to_dict(orient="index") 576 ) 577 578 stats["Infos"]["Number of variants"] = int(nb_of_variants) 579 580 ### Samples 581 582 # Init 583 samples = {} 584 nb_of_samples = 0 585 586 # Check Samples 587 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 588 log.debug(f"Check samples...") 589 for sample in self.get_header_sample_list(): 590 sql_query_samples = f""" 591 SELECT 
'{sample}' as sample, 592 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 593 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 594 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 595 FROM {table_variants_from} 596 WHERE ( 597 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 598 AND 599 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 600 ) 601 GROUP BY genotype 602 """ 603 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 604 sample_genotype_count = sql_query_genotype_df["count"].sum() 605 if len(sql_query_genotype_df): 606 nb_of_samples += 1 607 samples[f"{sample} - {sample_genotype_count} variants"] = ( 608 sql_query_genotype_df.to_dict(orient="index") 609 ) 610 611 stats["Samples"] = samples 612 stats["Infos"]["Number of samples"] = nb_of_samples 613 614 # # 615 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 616 # stats["Infos"]["Number of samples"] = nb_of_samples 617 # elif nb_of_samples: 618 # stats["Infos"]["Number of samples"] = "not a VCF format" 619 620 ### INFO and FORMAT fields 621 header_types_df = {} 622 header_types_list = { 623 "List of INFO fields": header_infos, 624 "List of FORMAT fields": header_formats, 625 } 626 i = 0 627 for header_type in header_types_list: 628 629 header_type_infos = header_types_list.get(header_type) 630 header_infos_dict = {} 631 632 for info in header_type_infos: 633 634 i += 1 635 header_infos_dict[i] = {} 636 637 # ID 638 header_infos_dict[i]["id"] = info 639 640 # num 641 genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"} 642 if header_type_infos[info].num in genotype_map.keys(): 643 header_infos_dict[i]["Number"] = genotype_map.get( 644 header_type_infos[info].num 645 ) 646 else: 647 header_infos_dict[i]["Number"] = header_type_infos[info].num 648 649 # type 650 if header_type_infos[info].type: 651 header_infos_dict[i]["Type"] = 
header_type_infos[info].type 652 else: 653 header_infos_dict[i]["Type"] = "." 654 655 # desc 656 if header_type_infos[info].desc != None: 657 header_infos_dict[i]["Description"] = header_type_infos[info].desc 658 else: 659 header_infos_dict[i]["Description"] = "" 660 661 if len(header_infos_dict): 662 header_types_df[header_type] = pd.DataFrame.from_dict( 663 header_infos_dict, orient="index" 664 ).to_dict(orient="index") 665 666 # Stats 667 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 668 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 669 stats["Header"] = header_types_df 670 671 ### QUAL 672 if "QUAL" in self.get_header_columns(): 673 sql_query_qual = f""" 674 SELECT 675 avg(CAST(QUAL AS INTEGER)) AS Average, 676 min(CAST(QUAL AS INTEGER)) AS Minimum, 677 max(CAST(QUAL AS INTEGER)) AS Maximum, 678 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 679 median(CAST(QUAL AS INTEGER)) AS Median, 680 variance(CAST(QUAL AS INTEGER)) AS Variance 681 FROM {table_variants_from} 682 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 683 """ 684 685 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 686 stats["Quality"] = {"Stats": qual} 687 688 ### SNV and InDel 689 690 sql_query_snv = f""" 691 692 SELECT Type, count FROM ( 693 694 SELECT 695 'Total' AS Type, 696 count(*) AS count 697 FROM {table_variants_from} 698 699 UNION 700 701 SELECT 702 'MNV' AS Type, 703 count(*) AS count 704 FROM {table_variants_from} 705 WHERE len(REF) > 1 AND len(ALT) > 1 706 AND len(REF) = len(ALT) 707 708 UNION 709 710 SELECT 711 'InDel' AS Type, 712 count(*) AS count 713 FROM {table_variants_from} 714 WHERE len(REF) > 1 OR len(ALT) > 1 715 AND len(REF) != len(ALT) 716 717 UNION 718 719 SELECT 720 'SNV' AS Type, 721 count(*) AS count 722 FROM {table_variants_from} 723 WHERE len(REF) = 1 AND len(ALT) = 1 724 725 ) 726 727 ORDER BY count DESC 728 729 """ 730 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 731 732 
sql_query_snv_substitution = f""" 733 SELECT 734 concat(REF, '>', ALT) AS 'Substitution', 735 count(*) AS count 736 FROM {table_variants_from} 737 WHERE len(REF) = 1 AND len(ALT) = 1 738 GROUP BY REF, ALT 739 ORDER BY count(*) DESC 740 """ 741 snv_substitution = ( 742 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 743 ) 744 stats["Variants"]["Counts"] = snv_indel 745 stats["Variants"]["Substitutions"] = snv_substitution 746 747 return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
749 def stats_to_file(self, file: str = None) -> str: 750 """ 751 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 752 into a JSON object, and writes the JSON object to the specified file. 753 754 :param file: The `file` parameter is a string that represents the file path where the JSON data 755 will be written 756 :type file: str 757 :return: the name of the file that was written to. 758 """ 759 760 # Get stats 761 stats = self.get_stats() 762 763 # Serializing json 764 json_object = json.dumps(stats, indent=4) 765 766 # Writing to sample.json 767 with open(file, "w") as outfile: 768 outfile.write(json_object) 769 770 return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: The `file` parameter is a string that represents the file path where the JSON data will be written.
Returns
the name of the file that was written to.
772 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 773 """ 774 The `print_stats` function generates a markdown file and prints the statistics contained in a 775 JSON file in a formatted manner. 776 777 :param output_file: The `output_file` parameter is a string that specifies the path and filename 778 of the output file where the stats will be printed in Markdown format. If no `output_file` is 779 provided, a temporary directory will be created and the stats will be saved in a file named 780 "stats.md" within that 781 :type output_file: str 782 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 783 file where the statistics will be saved. If no value is provided, a temporary directory will be 784 created and a default file name "stats.json" will be used 785 :type json_file: str 786 :return: The function `print_stats` does not return any value. It has a return type annotation 787 of `None`. 788 """ 789 790 # Full path 791 output_file = full_path(output_file) 792 json_file = full_path(json_file) 793 794 with tempfile.TemporaryDirectory() as tmpdir: 795 796 # Files 797 if not output_file: 798 output_file = os.path.join(tmpdir, "stats.md") 799 if not json_file: 800 json_file = os.path.join(tmpdir, "stats.json") 801 802 # Create folders 803 if not os.path.exists(os.path.dirname(output_file)): 804 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 805 if not os.path.exists(os.path.dirname(json_file)): 806 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 807 808 # Create stats JSON file 809 stats_file = self.stats_to_file(file=json_file) 810 811 # Print stats file 812 with open(stats_file) as f: 813 stats = yaml.safe_load(f) 814 815 # Output 816 output_title = [] 817 output_index = [] 818 output = [] 819 820 # Title 821 output_title.append("# HOWARD Stats") 822 823 # Index 824 output_index.append("## Index") 825 826 # Process sections 827 for section in stats: 
828 infos = stats.get(section) 829 section_link = "#" + section.lower().replace(" ", "-") 830 output.append(f"## {section}") 831 output_index.append(f"- [{section}]({section_link})") 832 833 if len(infos): 834 for info in infos: 835 try: 836 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 837 is_df = True 838 except: 839 try: 840 df = pd.DataFrame.from_dict( 841 json.loads((infos.get(info))), orient="index" 842 ) 843 is_df = True 844 except: 845 is_df = False 846 if is_df: 847 output.append(f"### {info}") 848 info_link = "#" + info.lower().replace(" ", "-") 849 output_index.append(f" - [{info}]({info_link})") 850 output.append(f"{df.to_markdown(index=False)}") 851 else: 852 output.append(f"- {info}: {infos.get(info)}") 853 else: 854 output.append(f"NA") 855 856 # Write stats in markdown file 857 with open(output_file, "w") as fp: 858 for item in output_title: 859 fp.write("%s\n" % item) 860 for item in output_index: 861 fp.write("%s\n" % item) 862 for item in output: 863 fp.write("%s\n" % item) 864 865 # Output stats in markdown 866 print("") 867 print("\n\n".join(output_title)) 868 print("") 869 print("\n\n".join(output)) 870 print("") 871 872 return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The `output_file` parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that directory.
- json_file: The `json_file` parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used.
Returns
The function
print_statsdoes not return any value. It has a return type annotation ofNone.
874 def get_input(self) -> str: 875 """ 876 It returns the value of the input variable. 877 :return: The input is being returned. 878 """ 879 return self.input
It returns the value of the input variable.
Returns
The input is being returned.
881 def get_input_format(self, input_file: str = None) -> str: 882 """ 883 This function returns the format of the input variable, either from the provided input file or 884 by prompting for input. 885 886 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 887 represents the file path of the input file. If no `input_file` is provided when calling the 888 method, it will default to `None` 889 :type input_file: str 890 :return: The format of the input variable is being returned. 891 """ 892 893 if not input_file: 894 input_file = self.get_input() 895 input_format = get_file_format(input_file) 896 return input_format
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters
- input_file: The
input_fileparameter in theget_input_formatmethod is a string that represents the file path of the input file. If noinput_fileis provided when calling the method, it will default toNone
Returns
The format of the input variable is being returned.
898 def get_input_compressed(self, input_file: str = None) -> str: 899 """ 900 The function `get_input_compressed` returns the format of the input variable after compressing 901 it. 902 903 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 904 that represents the file path of the input file. If no `input_file` is provided when calling the 905 method, it will default to `None` and the method will then call `self.get_input()` to 906 :type input_file: str 907 :return: The function `get_input_compressed` returns the compressed format of the input 908 variable. 909 """ 910 911 if not input_file: 912 input_file = self.get_input() 913 input_compressed = get_file_compressed(input_file) 914 return input_compressed
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters
- input_file: The
input_fileparameter in theget_input_compressedmethod is a string that represents the file path of the input file. If noinput_fileis provided when calling the method, it will default toNoneand the method will then callself.get_input()to
Returns
The function
get_input_compressedreturns the compressed format of the input variable.
916 def get_output(self) -> str: 917 """ 918 It returns the output of the neuron. 919 :return: The output of the neural network. 920 """ 921 922 return self.output
It returns the output file path.
Returns
The output file path is being returned.
924 def get_output_format(self, output_file: str = None) -> str: 925 """ 926 The function `get_output_format` returns the format of the input variable or the output file if 927 provided. 928 929 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 930 that represents the file path of the output file. If no `output_file` is provided when calling 931 the method, it will default to the output obtained from the `get_output` method of the class 932 instance. The 933 :type output_file: str 934 :return: The format of the input variable is being returned. 935 """ 936 937 if not output_file: 938 output_file = self.get_output() 939 output_format = get_file_format(output_file) 940 941 return output_format
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters
- output_file: The
output_fileparameter in theget_output_formatmethod is a string that represents the file path of the output file. If nooutput_fileis provided when calling the method, it will default to the output obtained from theget_outputmethod of the class instance. The
Returns
The format of the input variable is being returned.
943 def get_config(self) -> dict: 944 """ 945 It returns the config 946 :return: The config variable is being returned. 947 """ 948 return self.config
It returns the config
Returns
The config variable is being returned.
950 def get_param(self) -> dict: 951 """ 952 It returns the param 953 :return: The param variable is being returned. 954 """ 955 return self.param
It returns the param
Returns
The param variable is being returned.
957 def get_connexion_db(self) -> str: 958 """ 959 It returns the connexion_db attribute of the object 960 :return: The connexion_db is being returned. 961 """ 962 return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
964 def get_prefix(self) -> str: 965 """ 966 It returns the prefix of the object. 967 :return: The prefix is being returned. 968 """ 969 return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
971 def get_table_variants(self, clause: str = "select") -> str: 972 """ 973 This function returns the table_variants attribute of the object 974 975 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 976 defaults to select (optional) 977 :return: The table_variants attribute of the object. 978 """ 979 980 # Access 981 access = self.get_config().get("access", None) 982 983 # Clauses "select", "where", "update" 984 if clause in ["select", "where", "update"]: 985 table_variants = self.table_variants 986 # Clause "from" 987 elif clause in ["from"]: 988 # For Read Only 989 if self.get_input_format() in ["parquet"] and access in ["RO"]: 990 input_file = self.get_input() 991 table_variants = f"'{input_file}' as variants" 992 # For Read Write 993 else: 994 table_variants = f"{self.table_variants} as variants" 995 else: 996 table_variants = self.table_variants 997 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns
The table_variants attribute of the object.
999 def get_tmp_dir(self) -> str: 1000 """ 1001 The function `get_tmp_dir` returns the temporary directory path based on configuration 1002 parameters or a default path. 1003 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1004 configuration, parameters, and a default value of "/tmp". 1005 """ 1006 1007 return get_tmp( 1008 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1009 )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The
get_tmp_dirmethod is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".
1011 def get_connexion_type(self) -> str: 1012 """ 1013 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1014 1015 :return: The connexion type is being returned. 1016 """ 1017 return self.get_config().get("connexion_type", "memory")
If the connexion type is not in the list of allowed connexion types, raise a ValueError
Returns
The connexion type is being returned.
1019 def get_connexion(self): 1020 """ 1021 It returns the connection object 1022 1023 :return: The connection object. 1024 """ 1025 return self.conn
It returns the connection object
Returns
The connection object.
1027 def close_connexion(self) -> None: 1028 """ 1029 This function closes the connection to the database. 1030 :return: The connection is being closed. 1031 """ 1032 return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
1034 def get_header(self, type: str = "vcf"): 1035 """ 1036 This function returns the header of the VCF file as a list of strings 1037 1038 :param type: the type of header you want to get, defaults to vcf (optional) 1039 :return: The header of the vcf file. 1040 """ 1041 1042 if self.header_vcf: 1043 if type == "vcf": 1044 return self.header_vcf 1045 elif type == "list": 1046 return self.header_list 1047 else: 1048 if type == "vcf": 1049 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1050 return header 1051 elif type == "list": 1052 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1054 def get_header_length(self, file: str = None) -> int: 1055 """ 1056 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1057 line. 1058 1059 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1060 header file. If this argument is provided, the function will read the header from the specified 1061 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1062 :type file: str 1063 :return: the length of the header list, excluding the #CHROM line. 1064 """ 1065 1066 if file: 1067 return len(self.read_vcf_header_file(file=file)) - 1 1068 elif self.get_header(type="list"): 1069 return len(self.get_header(type="list")) - 1 1070 else: 1071 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- file: The
fileparameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns
the length of the header list, excluding the #CHROM line.
1073 def get_header_columns(self) -> str: 1074 """ 1075 This function returns the header list of a VCF 1076 1077 :return: The length of the header list. 1078 """ 1079 if self.get_header(): 1080 return self.get_header(type="list")[-1] 1081 else: 1082 return ""
This function returns the columns line of a VCF header (the "#CHROM ..." line).
Returns
The last line of the header list, containing the column names.
1084 def get_header_columns_as_list(self) -> list: 1085 """ 1086 This function returns the header list of a VCF 1087 1088 :return: The length of the header list. 1089 """ 1090 if self.get_header(): 1091 return self.get_header_columns().strip().split("\t") 1092 else: 1093 return []
This function returns the header list of a VCF
Returns
The length of the header list.
1095 def get_header_columns_as_sql(self) -> str: 1096 """ 1097 This function retruns header length (without #CHROM line) 1098 1099 :return: The length of the header list. 1100 """ 1101 sql_column_list = [] 1102 for col in self.get_header_columns_as_list(): 1103 sql_column_list.append(f'"{col}"') 1104 return ",".join(sql_column_list)
This function returns the VCF header column names formatted for SQL.
Returns
A comma-separated list of double-quoted column names.
1106 def get_header_sample_list(self) -> list: 1107 """ 1108 This function retruns header length (without #CHROM line) 1109 1110 :return: The length of the header list. 1111 """ 1112 return self.header_vcf.samples
This function returns the list of sample names from the VCF header.
Returns
The list of samples of the VCF header.
1114 def get_verbose(self) -> bool: 1115 """ 1116 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1117 exist 1118 1119 :return: The value of the key "verbose" in the config dictionary. 1120 """ 1121 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1123 def get_connexion_format(self) -> str: 1124 """ 1125 It returns the connexion format of the object. 1126 :return: The connexion_format is being returned. 1127 """ 1128 connexion_format = self.connexion_format 1129 if connexion_format not in ["duckdb", "sqlite"]: 1130 log.error(f"Unknown connexion format {connexion_format}") 1131 raise ValueError(f"Unknown connexion format {connexion_format}") 1132 else: 1133 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table, using the syntax appropriate to the current
        database connexion format (duckdb or sqlite).

        :param file: the file to load (path or open file handle, as accepted
            by pandas.read_csv)
        :param columns: comma-separated, quoted column names matching the
            "variants" table columns (e.g. '"#CHROM","POS",...')
        :type columns: str
        :param header_len: number of header lines to skip before the data,
            defaults to 0
        :type header_len: int (optional)
        :param sep: field separator used in the file, defaults to tab
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; may be overridden by
            config "load.chunk"; if the resulting value is falsy, nothing is
            inserted, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: allow the chunk size to be overridden by config "load.chunk"
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # NOTE: duckdb resolves "chunk" in the SQL below by the name
                    # of the local pandas DataFrame (replacement scan) — the
                    # variable name is load-bearing, do not rename it
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # pandas issues the INSERTs itself for sqlite connexions
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- file: The
fileparameter is the file that you want to load into a table. It should be the path to the file on your system - columns: The
columnsparameter in theinsert_file_to_tablefunction is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name - header_len: The
header_lenparameter in theinsert_file_to_tablefunction specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0 - sep: The
sepparameter in theinsert_file_to_tablefunction is used to specify the separator character that is used in the file being read. In this case, the default separator is set to, which represents a tab character. You can change this parameter to a different separator character if, defaults to - chunksize: The
chunksizeparameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value forchunksizeis set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
1189 def load_data( 1190 self, 1191 input_file: str = None, 1192 drop_variants_table: bool = False, 1193 sample_size: int = 20480, 1194 ) -> None: 1195 """ 1196 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1197 table before loading the data and specify a sample size. 1198 1199 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1200 table 1201 :type input_file: str 1202 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1203 determines whether the variants table should be dropped before loading the data. If set to 1204 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1205 not be dropped, defaults to False 1206 :type drop_variants_table: bool (optional) 1207 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1208 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1209 20480 1210 :type sample_size: int (optional) 1211 """ 1212 1213 log.info("Loading...") 1214 1215 # change input file 1216 if input_file: 1217 self.set_input(input_file) 1218 self.set_header() 1219 1220 # drop variants table 1221 if drop_variants_table: 1222 self.drop_variants_table() 1223 1224 # get table variants 1225 table_variants = self.get_table_variants() 1226 1227 # Access 1228 access = self.get_config().get("access", None) 1229 log.debug(f"access: {access}") 1230 1231 # Input format and compress 1232 input_format = self.get_input_format() 1233 input_compressed = self.get_input_compressed() 1234 log.debug(f"input_format: {input_format}") 1235 log.debug(f"input_compressed: {input_compressed}") 1236 1237 # input_compressed_format 1238 if input_compressed: 1239 input_compressed_format = "gzip" 1240 else: 1241 input_compressed_format = "none" 1242 log.debug(f"input_compressed_format: {input_compressed_format}") 1243 1244 # Connexion 
format 1245 connexion_format = self.get_connexion_format() 1246 1247 # Sample size 1248 if not sample_size: 1249 sample_size = -1 1250 log.debug(f"sample_size: {sample_size}") 1251 1252 # Load data 1253 log.debug(f"Load Data from {input_format}") 1254 1255 # DuckDB connexion 1256 if connexion_format in ["duckdb"]: 1257 1258 # Database already exists 1259 if self.input_format in ["db", "duckdb"]: 1260 1261 if connexion_format in ["duckdb"]: 1262 log.debug(f"Input file format '{self.input_format}' duckDB") 1263 else: 1264 log.error( 1265 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1266 ) 1267 raise ValueError( 1268 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1269 ) 1270 1271 # Load from existing database format 1272 else: 1273 1274 try: 1275 # Create Table or View 1276 database = Database(database=self.input) 1277 sql_from = database.get_sql_from(sample_size=sample_size) 1278 1279 if access in ["RO"]: 1280 sql_load = ( 1281 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1282 ) 1283 else: 1284 sql_load = ( 1285 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1286 ) 1287 self.conn.execute(sql_load) 1288 1289 except: 1290 # Format not available 1291 log.error(f"Input file format '{self.input_format}' not available") 1292 raise ValueError( 1293 f"Input file format '{self.input_format}' not available" 1294 ) 1295 1296 # SQLite connexion 1297 elif connexion_format in ["sqlite"] and input_format in [ 1298 "vcf", 1299 "tsv", 1300 "csv", 1301 "psv", 1302 ]: 1303 1304 # Main structure 1305 structure = { 1306 "#CHROM": "VARCHAR", 1307 "POS": "INTEGER", 1308 "ID": "VARCHAR", 1309 "REF": "VARCHAR", 1310 "ALT": "VARCHAR", 1311 "QUAL": "VARCHAR", 1312 "FILTER": "VARCHAR", 1313 "INFO": "VARCHAR", 1314 } 1315 1316 # Strcuture with samples 1317 structure_complete = structure 1318 if self.get_header_sample_list(): 1319 structure["FORMAT"] = "VARCHAR" 
1320 for sample in self.get_header_sample_list(): 1321 structure_complete[sample] = "VARCHAR" 1322 1323 # Columns list for create and insert 1324 sql_create_table_columns = [] 1325 sql_create_table_columns_list = [] 1326 for column in structure_complete: 1327 column_type = structure_complete[column] 1328 sql_create_table_columns.append( 1329 f'"{column}" {column_type} default NULL' 1330 ) 1331 sql_create_table_columns_list.append(f'"{column}"') 1332 1333 # Create database 1334 log.debug(f"Create Table {table_variants}") 1335 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1336 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1337 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1338 self.conn.execute(sql_create_table) 1339 1340 # chunksize define length of file chunk load file 1341 chunksize = 100000 1342 1343 # delimiter 1344 delimiter = file_format_delimiters.get(input_format, "\t") 1345 1346 # Load the input file 1347 with open(self.input, "rt") as input_file: 1348 1349 # Use the appropriate file handler based on the input format 1350 if input_compressed: 1351 input_file = bgzf.open(self.input, "rt") 1352 if input_format in ["vcf"]: 1353 header_len = self.get_header_length() 1354 else: 1355 header_len = 0 1356 1357 # Insert the file contents into a table 1358 self.insert_file_to_table( 1359 input_file, 1360 columns=sql_create_table_columns_list_sql, 1361 header_len=header_len, 1362 sep=delimiter, 1363 chunksize=chunksize, 1364 ) 1365 1366 else: 1367 log.error( 1368 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1369 ) 1370 raise ValueError( 1371 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1372 ) 1373 1374 # Explode INFOS fields into table fields 1375 if self.get_explode_infos(): 1376 self.explode_infos( 1377 prefix=self.get_explode_infos_prefix(), 1378 fields=self.get_explode_infos_fields(), 
1379 force=True, 1380 ) 1381 1382 # Create index after insertion 1383 self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The `drop_variants_table` parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to `True`, the variants table will be dropped; if set to `False` (default), it will not be dropped. Defaults to False.
- sample_size: The `sample_size` parameter determines the number of rows to be sampled from the input file. If it is set to `None`, the default value of 20480 will be used. Defaults to 20480.
def get_explode_infos(self) -> bool:
    """
    Return the "explode_infos" flag from the parameters.

    :return: the boolean value of the "explode_infos" parameter inside the
        "explode" section of the parameters, or False when it is not set.
    """

    explode_section = self.get_param().get("explode", {})
    return explode_section.get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
def get_explode_infos_fields(
    self,
    explode_infos_fields: str = None,
    remove_fields_not_in_header: bool = False,
) -> list:
    """
    Resolve the list of INFO fields to explode into individual columns.

    The input may be a comma-separated string or a list of field names;
    names are treated as regex patterns matched against the header INFO
    fields. The keyword "*" expands to all header INFO fields (the prose
    docs mentioning an "ALL" keyword are outdated — the code only handles
    "*"). When nothing is provided, the "explode.explode_infos_fields"
    parameter is used, and failing that "*" (all fields).

    :param explode_infos_fields: fields to explode; "*" for all fields, or
        a comma-separated string / list of field names or regex patterns
    :type explode_infos_fields: str
    :param remove_fields_not_in_header: when True, fields absent from the
        header are removed from the result; when False (default), a field
        with no header match is kept as-is
    :type remove_fields_not_in_header: bool (optional)
    :return: the resolved, de-duplicated list of field names (order follows
        the input patterns; matches of a pattern are sorted alphabetically)
    """

    # If no fields given, read them from the parameters
    if not explode_infos_fields:
        explode_infos_fields = (
            self.get_param().get("explode", {}).get("explode_infos_fields", None)
        )

    # Still nothing: default to the "*" keyword (all header fields)
    if not explode_infos_fields:
        explode_infos_fields = "*"

    # If fields list not empty
    if explode_infos_fields:

        # Normalize the input into a list of field names/patterns
        if isinstance(explode_infos_fields, str):
            fields_input = explode_infos_fields.split(",")
        elif isinstance(explode_infos_fields, list):
            fields_input = explode_infos_fields
        else:
            fields_input = []

        # Fields list without the "*" keyword
        # NOTE(review): fields_without_all is computed but never used below —
        # candidate for removal; kept to preserve behavior byte-for-byte
        fields_without_all = fields_input.copy()
        if "*".casefold() in (item.casefold() for item in fields_without_all):
            fields_without_all.remove("*")

        # Unique, sorted INFO field names from the header
        fields_in_header = sorted(list(set(self.get_header().infos)))

        # Construct list of fields
        fields_output = []
        for field in fields_input:

            # Strip surrounding whitespace (allows "a, b, c" input)
            field = field.strip()

            # Translate the "*" keyword into a match-all regex
            if field.upper() in ["*"]:
                field = ".*"

            # Find all header fields matching the pattern
            r = re.compile(field)
            fields_search = sorted(list(filter(r.match, fields_in_header)))

            # A pattern (not an exact single match) must not re-add fields
            # that were already explicitly listed in the input
            if fields_search != [field]:
                fields_search = sorted(
                    list(set(fields_search).difference(fields_input))
                )

            # Keep an unmatched field as-is unless the caller asked to drop
            # fields missing from the header (tolerates malformed headers)
            if not fields_search and not remove_fields_not_in_header:
                fields_search = [field]

            # Add found fields
            for new_field in fields_search:
                # Add field, if not already present, if it is in the header
                # (when asked), and never the ".*" pattern itself
                if (
                    new_field not in fields_output
                    and (
                        not remove_fields_not_in_header
                        or new_field in fields_in_header
                    )
                    and new_field not in [".*"]
                ):
                    fields_output.append(new_field)

        return fields_output

    else:

        return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: The `explode_infos_fields` parameter specifies the fields to be exploded. It can be set to `"*"` to explode all fields, or to a comma-separated list of field names (or patterns) to explode.
- remove_fields_not_in_header: The `remove_fields_not_in_header` parameter is a boolean flag that determines whether to remove fields that are not present in the header. If set to `True`, any field that is not in the header is excluded from the list of exploded information fields; if set to `False`, such fields are kept. Defaults to False.
Returns
The function `get_explode_infos_fields` returns a list of exploded information fields. If the `explode_infos_fields` parameter is not provided, the value is read from the parameters, and failing that defaults to `"*"` (all header fields). Otherwise, it returns the list of field names after stripping spaces, splitting comma-separated input, and expanding patterns against the header.
def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
    """
    Return the prefix used for exploded INFO field columns.

    :param explode_infos_prefix: an explicit prefix; when empty or None,
        the value is looked up in the "explode" section of the parameters,
        defaulting to an empty string
    :type explode_infos_prefix: str
    :return: the resolved prefix string.
    """

    if explode_infos_prefix:
        return explode_infos_prefix

    explode_section = self.get_param().get("explode", {})
    return explode_section.get("explode_infos_prefix", "")
The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or
the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is
not provided.
Parameters
- explode_infos_prefix: The `explode_infos_prefix` parameter is a string that specifies a prefix to be used for exploded INFO fields.
Returns
The resolved value of `explode_infos_prefix` (the explicit argument, or the configured parameter, or an empty string).
def add_column(
    self,
    table_name,
    column_name,
    column_type,
    default_value=None,
    drop: bool = False,
) -> dict:
    """
    Add a column to a SQLite or DuckDB table if it does not already exist.

    :param table_name: name of the table to alter
    :param column_name: name of the column to add
    :param column_type: SQL type of the new column, e.g. "INTEGER", "TEXT",
        "VARCHAR", "REAL"
    :param default_value: optional DEFAULT value for the new column; it is
        interpolated verbatim into the SQL, so string values must already be
        quoted (e.g. "'x'" or "null")
    :param drop: when True and the column already exists, drop it first and
        re-create it; when False (default), an existing column is left
        untouched and None is returned, defaults to False
    :type drop: bool (optional)
    :return: a dict describing the added column ("table_name", "column_name",
        "column_type", "default_value"), or None when no *new* column was
        added (already existed without drop, or was dropped and re-created)
    """

    # Track whether a net-new column was added / an existing one dropped
    added = False
    dropped = False

    # Probe the table with a zero-row SELECT to list its current columns
    query = f""" SELECT * FROM {table_name} LIMIT 0 """
    columns = self.get_query_to_df(query).columns.tolist()
    if column_name in columns:
        log.debug(
            f"The {column_name} column already exists in the {table_name} table"
        )
        if drop:
            self.drop_column(table_name=table_name, column_name=column_name)
            dropped = True
        else:
            # Column present and no drop requested: nothing to do
            return None
    else:
        log.debug(f"The {column_name} column NOT exists in the {table_name} table")

    # Add column in table
    # NOTE(review): table_name/column_type are interpolated unquoted —
    # callers must pass trusted identifiers (no untrusted input here)
    add_column_query = (
        f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
    )
    if default_value is not None:
        add_column_query += f" DEFAULT {default_value}"
    self.execute_query(add_column_query)
    # NOTE(review): after a drop+re-create, added stays False, so the
    # method returns None even though the column was (re)created — callers
    # relying on the return value should also check their own `force` flag
    added = not dropped
    log.debug(
        f"The {column_name} column was successfully added to the {table_name} table"
    )

    if added:
        added_column = {
            "table_name": table_name,
            "column_name": column_name,
            "column_type": column_type,
            "default_value": default_value,
        }
    else:
        added_column = None

    return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- column_type: The `column_type` parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc.
- default_value: The `default_value` parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column.
- drop: The `drop` parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If `drop` is set to `True`, the function will drop the existing column before adding the new column; if set to `False` (default), an existing column is left untouched. Defaults to False.
Returns
A dictionary describing the added column (table name, column name, column type, default value), or `None` when no new column was added.
def drop_column(
    self, column: dict = None, table_name: str = None, column_name: str = None
) -> bool:
    """
    Drop a column from a table in the database.

    :param column: either a dict with "table_name" and "column_name" keys,
        or a plain column name (string) to drop from the variants table;
        takes precedence over the explicit parameters when provided
    :type column: dict
    :param table_name: name of the table to drop the column from
    :type table_name: str
    :param column_name: name of the column to drop
    :type column_name: str
    :return: True if the column was dropped; False when the column does not
        exist, or when the table/column could not be determined.
    """

    # Resolve table/column from the `column` argument when provided
    if column:
        if isinstance(column, dict):
            table_name = column.get("table_name", None)
            column_name = column.get("column_name", None)
        elif isinstance(column, str):
            # Bare column name: drop from the variants table
            table_name = self.get_table_variants()
            column_name = column
        else:
            table_name = None
            column_name = None

    # Both a table AND a column are required.
    # BUGFIX: was `and`, which let a half-specified call (e.g. table_name
    # missing but column_name set) fall through to a malformed SQL query.
    if not table_name or not column_name:
        return False

    # Probe the table with a zero-row SELECT to list its current columns
    query = f""" SELECT * FROM {table_name} LIMIT 0 """
    columns = self.get_query_to_df(query).columns.tolist()
    if column_name in columns:
        log.debug(f"The {column_name} column exists in the {table_name} table")
    else:
        log.debug(f"The {column_name} column NOT exists in the {table_name} table")
        return False

    # Drop the column (e.g. ALTER TABLE integers DROP k)
    drop_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
    self.execute_query(drop_column_query)
    log.debug(
        f"The {column_name} column was successfully dropped to the {table_name} table"
    )

    return True
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- column: The `column` parameter is either a dictionary containing information about the column you want to drop (with keys `table_name` and `column_name`), or a plain column name as a string.
- table_name: The `table_name` parameter is the name of the table from which you want to drop a column.
- column_name: The `column_name` parameter is the name of the column that you want to drop from the table.
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
def explode_infos(
    self,
    prefix: str = None,
    create_index: bool = False,
    fields: list = None,
    force: bool = False,
    proccess_all_fields_together: bool = False,
) -> list:
    """
    Explode VCF INFO fields into individual table columns.

    For each selected INFO field, a column named `<prefix><field>` is added
    to the variants table and populated by parsing the raw INFO string in
    SQL, chromosome by chromosome. No-op on read-only ("RO") connections.

    :param prefix: prefix for the exploded column names; when None (or not
        a string), falls back to `self.get_explode_infos_prefix()`, then
        "INFO/"
    :type prefix: str
    :param create_index: when True, recreate indexes after exploding,
        defaults to False
    :type create_index: bool (optional)
    :param fields: list of INFO fields (or patterns) to explode; resolved
        through `get_explode_infos_fields` ("*" expands to all header
        fields)
    :type fields: list
    :param force: when True, an already-existing column is dropped and
        recreated (and re-populated), defaults to False
    :type force: bool (optional)
    :param proccess_all_fields_together: when True, all fields are updated
        in a single UPDATE per chromosome instead of one UPDATE per field
        (parameter name keeps its original spelling for API compatibility),
        defaults to False
    :type proccess_all_fields_together: bool (optional)
    :return: the list of added columns (dicts as returned by `add_column`).
    """

    # Indexes would slow down (or conflict with) the bulk UPDATEs below
    self.drop_indexes()

    # Connexion format ("duckdb" or "sqlite") selects the SQL dialect used
    connexion_format = self.get_connexion_format()

    # Access mode: read-only connections are left untouched
    access = self.get_config().get("access", None)

    # Columns added by this call
    added_columns = []

    if access not in ["RO"]:

        # Resolve prefix: explicit argument, configured prefix, or "INFO/"
        if prefix in [None, True] or not isinstance(prefix, str):
            if self.get_explode_infos_prefix() not in [None, True]:
                prefix = self.get_explode_infos_prefix()
            else:
                prefix = "INFO/"

        # table variants
        table_variants = self.get_table_variants(clause="select")

        # Extra infos (best-effort; any failure means "none")
        try:
            extra_infos = self.get_extra_infos()
        except:
            extra_infos = []

        # INFO field definitions from the VCF header
        header_infos = self.get_header().infos

        log.debug(
            f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
        )

        sql_info_alter_table_array = []

        # Known fields = header INFO fields plus any explicitly requested
        fields_list = list(header_infos)
        if fields:
            fields_list += fields
        fields_list = set(fields_list)

        # If no fields
        if not fields:
            fields = []

        # Translate patterns (e.g. "*") into concrete field names
        fields = self.get_explode_infos_fields(explode_infos_fields=fields)

        for info in fields:

            info_id_sql = prefix + info

            # Only explode fields known from header, request, or extras
            if (
                info in fields_list
                or prefix + info in fields_list
                or info in extra_infos
            ):

                log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                # Type/arity from the header; unknown fields become String
                if info in header_infos:
                    info_type = header_infos[info].type
                    info_num = header_infos[info].num
                else:
                    info_type = "String"
                    info_num = 0

                # Multi-valued fields are always stored as VARCHAR
                type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                if info_num != 1:
                    type_sql = "VARCHAR"

                # Add field
                added_column = self.add_column(
                    table_name=table_variants,
                    column_name=info_id_sql,
                    column_type=type_sql,
                    default_value="null",
                    drop=force,
                )

                if added_column:
                    added_columns.append(added_column)

                if added_column or force:

                    # Register field so create_indexes() will index it
                    self.index_additionnal_fields.append(info_id_sql)

                    # SQL extracting "<info>=<value>" from the INFO string;
                    # empty or "." values are normalized to NULL (duckdb)
                    if connexion_format in ["duckdb"]:
                        update_info_field = f"""
                        "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                        """
                    elif connexion_format in ["sqlite"]:
                        # sqlite has no regex: emulate with instr/substr
                        update_info_field = f"""
                        "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                        """

                    sql_info_alter_table_array.append(update_info_field)

        if sql_info_alter_table_array:

            # Process chromosome by chromosome to bound each UPDATE's size
            try:
                chromosomes_list = list(
                    self.get_query_to_df(
                        f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                    )["#CHROM"]
                )
            except:
                # Fallback: single pass over the whole table
                chromosomes_list = [None]

            for chrom in chromosomes_list:
                log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                # Restrict to one chromosome only when there are several
                where_clause = ""
                if chrom and len(chromosomes_list) > 1:
                    where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                # Update table
                if proccess_all_fields_together:
                    # One UPDATE setting every exploded field at once
                    sql_info_alter_table_array_join = ", ".join(
                        sql_info_alter_table_array
                    )
                    if sql_info_alter_table_array_join:
                        sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                        log.debug(
                            f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                        )
                        # log.debug(sql_info_alter_table)
                        self.conn.execute(sql_info_alter_table)
                else:
                    # One UPDATE per exploded field
                    sql_info_alter_num = 0
                    for sql_info_alter in sql_info_alter_table_array:
                        sql_info_alter_num += 1
                        sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                        log.debug(
                            f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                        )
                        # log.debug(sql_info_alter_table)
                        self.conn.execute(sql_info_alter_table)

    # create indexes
    if create_index:
        self.create_indexes()

    return added_columns
The explode_infos function takes a VCF file and explodes the INFO fields into individual
columns, returning a list of added columns.
Parameters
- prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO fields. If the `prefix` is not provided or is set to `None`, the function will use the value of `self.get_explode_infos_prefix()` as the prefix.
- create_index: The `create_index` parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to `False`, indexes will not be created. Defaults to False.
- fields: The `fields` parameter is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded.
- force: The `force` parameter is a boolean flag that determines whether to drop and recreate the column if it already exists in the table. If `force` is set to `True`, the column will be dropped and recreated; if set to `False`, the column will not be dropped. Defaults to False.
- proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to `True`, all the INFO fields will be processed together; if set to `False`, each INFO field will be processed individually. Defaults to False.
Returns
The function
explode_infosreturns a list of added columns.
def create_indexes(self) -> None:
    """
    Create the standard indexes on the variants table (composite key plus
    one index per coordinate column), and one index per additional exploded
    field. Skipped when indexing is disabled or access is read-only ("RO").
    """

    # Access mode: never create indexes on a read-only connection
    access = self.get_config().get("access", None)

    # Table to index
    table_variants = self.get_table_variants("FROM")

    if self.get_indexing() and access not in ["RO"]:
        index_base = f"idx_{self.get_table_variants()}"

        # (index name, indexed column list) pairs for the standard indexes
        standard_indexes = [
            (index_base, '"#CHROM", "POS", "REF", "ALT"'),
            (f"{index_base}_chrom", '"#CHROM"'),
            (f"{index_base}_pos", '"POS"'),
            (f"{index_base}_ref", ' "REF"'),
            (f"{index_base}_alt", '"ALT"'),
        ]
        for index_name, index_columns in standard_indexes:
            self.conn.execute(
                f'CREATE INDEX IF NOT EXISTS {index_name} ON {table_variants} ({index_columns})'
            )

        # One index per additional (exploded) field
        for field in self.index_additionnal_fields:
            self.conn.execute(
                f""" CREATE INDEX IF NOT EXISTS "{index_base}_{field}" ON {table_variants} ("{field}") """
            )
Create indexes on the table after insertion
def drop_indexes(self) -> None:
    """
    Drop all existing indexes on the variants table.

    No-op when the connection is read-only ("RO" access). When the
    connection format is neither "duckdb" nor "sqlite", a warning is logged
    and nothing is dropped.
    """

    # Access mode: never mutate a read-only database
    access = self.get_config().get("access", None)

    # Table whose indexes are dropped
    table_variants = self.get_table_variants("FROM")

    # Database format determines which catalog holds the index names
    connexion_format = self.get_connexion_format()

    if access not in ["RO"]:
        if connexion_format in ["duckdb"]:
            sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
        elif connexion_format in ["sqlite"]:
            sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
        else:
            # BUGFIX: an unknown format previously fell through to an
            # unbound variable (NameError); now it is an explicit no-op.
            log.warning(
                f"Connexion format '{connexion_format}' not supported for dropping indexes"
            )
            return

        # Collect index names, then drop them one by one
        list_indexes = self.conn.execute(sql_list_indexes)
        index_names = [row[0] for row in list_indexes.fetchall()]
        for index in index_names:
            sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
            self.conn.execute(sql_drop_table_index)
Drop the existing indexes on the table (e.g. before a bulk update).
def read_vcf_header(self, f) -> list:
    """
    Read the header lines of a VCF file handle.

    Lines are accumulated up to and including the "#CHROM" column line.

    :param f: an iterable of text lines (typically an open file object)
    :return: the list of header lines, including the "#CHROM" line when
        present (all lines when it is absent).
    """

    header_lines = []
    for current_line in f:
        header_lines.append(current_line)
        if current_line.startswith("#CHROM"):
            # "#CHROM" marks the last header line: stop here
            return header_lines
    return header_lines
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
def read_vcf_header_file(self, file: str = None) -> list:
    """
    Read the header of a VCF file on disk, transparently handling
    bgzip-compressed files.

    :param file: path to the VCF (or VCF header) file to read
    :type file: str
    :return: the list of header lines.
    """

    # Pick the opener according to the file compression
    opener = bgzf.open if self.get_input_compressed(input_file=file) else open
    with opener(file, "rt") as file_handle:
        return self.read_vcf_header(f=file_handle)
The read_vcf_header_file function reads the header of a VCF file, handling both compressed and
uncompressed files.
Parameters
- file: The `file` parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to `None`.
Returns
The function `read_vcf_header_file` returns a list of header lines.
def execute_query(self, query: str):
    """
    Execute a SQL query on the current connection.

    :param query: the SQL query to run; falsy values are ignored
    :return: the result of the execution, or None when no query was given.
    """

    if not query:
        return None
    return self.conn.execute(query)  # .fetchall()
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
def export_output(
    self,
    output_file: str | None = None,
    output_header: str | None = None,
    export_header: bool = True,
    query: str | None = None,
    parquet_partitions: list | None = None,
    chunk_size: int | None = None,
    threads: int | None = None,
    sort: bool = False,
    index: bool = False,
    order_by: str | None = None,
) -> bool:
    """
    Export the loaded variants to an output file (VCF, CSV, TSV, PSV or
    Parquet, inferred from the output file name).

    For sqlite connections the data is first dumped to a temporary Parquet
    file, which then becomes the source for the `Database.export` call.

    :param output_file: path of the output file; falls back to
        `self.get_output()` when not provided
    :type output_file: str
    :param output_header: path of the exported header file; defaults to
        `<output_file>.hdr` when not provided
    :type output_header: str
    :param export_header: whether to export the header to a separate file;
        forced off (and the header file scheduled for removal) when the
        output format is VCF, defaults to True
    :type export_header: bool (optional)
    :param query: optional SQL query forwarded to `Database.export` to
        filter/select the exported data
    :type query: str
    :param parquet_partitions: columns used to partition the Parquet output;
        a comma-separated string is split into a list
    :type parquet_partitions: list
    :param chunk_size: batch size (records) for Parquet export; falls back
        to the "chunk_size" config value
    :type chunk_size: int
    :param threads: number of threads for the export; falls back to
        `self.get_threads()`
    :type threads: int
    :param sort: whether to sort the output file, defaults to False
    :type sort: bool (optional)
    :param index: whether to create an index on the output file, defaults
        to False
    :type index: bool (optional)
    :param order_by: column(s) used to order the output; falls back to the
        "export.order_by" parameter
    :type order_by: str
    :return: True if the output file exists after export, otherwise None.
    """

    # Log
    log.info("Exporting...")

    # Resolve paths to full/absolute form
    output_file = full_path(output_file)
    output_header = full_path(output_header)

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Temporary files to remove at the end
    tmp_to_remove = []

    # If no output, get it
    if not output_file:
        output_file = self.get_output()

    # If not threads
    if not threads:
        threads = self.get_threads()

    # Auto header name with extension
    if export_header or output_header:
        if not output_header:
            output_header = f"{output_file}.hdr"
        # Export header
        self.export_header(output_file=output_file)

    # Switch off export header if VCF output (header is embedded in a VCF)
    output_file_type = get_file_format(output_file)
    if output_file_type in ["vcf"]:
        export_header = False
        tmp_to_remove.append(output_header)

    # Chunk size
    if not chunk_size:
        chunk_size = config.get("chunk_size", None)

    # Parquet partition (comma-separated string becomes a list)
    if not parquet_partitions:
        parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
    if parquet_partitions and isinstance(parquet_partitions, str):
        parquet_partitions = parquet_partitions.split(",")

    # Order by
    if not order_by:
        order_by = param.get("export", {}).get("order_by", "")

    # Whether to include the header inside the output file itself
    header_in_output = param.get("export", {}).get("include_header", False)

    # Database source: the live connection by default
    database_source = self.get_connexion()

    # Connexion format
    connexion_format = self.get_connexion_format()

    # Explode infos before export, if requested
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=False,
        )

    # if connexion_format in ["sqlite"] or query:
    if connexion_format in ["sqlite"]:

        # sqlite: dump to a uniquely-named temporary Parquet file first
        random_tmp = "".join(
            random.choice(string.ascii_lowercase) for i in range(10)
        )
        database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
        tmp_to_remove.append(database_source)

        # Table Variants
        table_variants = self.get_table_variants()

        # Create export query
        sql_query_export_subquery = f"""
            SELECT * FROM {table_variants}
            """

        # Write the whole table to the temporary Parquet source
        fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

    # Wrap the source (connection or Parquet dump) in a Database object
    database = Database(
        database=database_source,
        table="variants",
        header_file=output_header,
        conn_config=self.get_connexion_config(),
    )

    # Existing colomns header
    # existing_columns_header = database.get_header_file_columns(output_header)
    existing_columns_header = database.get_header_columns_from_database()

    # Export file
    database.export(
        output_database=output_file,
        output_header=output_header,
        existing_columns_header=existing_columns_header,
        parquet_partitions=parquet_partitions,
        chunk_size=chunk_size,
        threads=threads,
        sort=sort,
        index=index,
        header_in_output=header_in_output,
        order_by=order_by,
        query=query,
        export_header=export_header,
    )

    # Remove temporary files
    remove_if_exists(tmp_to_remove)

    # NOTE(review): both operands of this `and` are identical — this is
    # equivalent to `os.path.exists(output_file) or None`; simplify later
    return (os.path.exists(output_file) or None) and (
        os.path.exists(output_file) or None
    )
The export_output function exports data from a VCF file to a specified output file in various
formats, including VCF, CSV, TSV, PSV, and Parquet.
Parameters
- `output_file`: a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved.
- `output_header`: a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the `output_file` parameter, but with the extension ".hdr".
- `export_header`: a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If `export_header` is True, the header will be exported to a file; if False, it will not be. Defaults to True, if the output format is not VCF.
- `query`: an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported.
- `parquet_partitions`: a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets.
- `chunk_size`: the number of records per batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
- `threads`: an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads.
- `sort`: a boolean flag that determines whether the output file should be sorted or not. If `sort` is set to True, the output file will be sorted based on the genomic coordinates of the variants. Defaults to False.
- `index`: a boolean flag that determines whether an index should be created on the output file. If True, an index will be created; if False, no index will be created. Defaults to False.
- `order_by`: a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format.
Returns
a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.
2135 def get_extra_infos(self, table: str = None) -> list: 2136 """ 2137 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2138 in the header. 2139 2140 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2141 name of the table from which you want to retrieve the extra columns that are not present in the 2142 header. If the `table` parameter is not provided when calling the function, it will default to 2143 using the variants 2144 :type table: str 2145 :return: A list of columns that are in the specified table but not in the header of the table. 2146 """ 2147 2148 header_columns = [] 2149 2150 if not table: 2151 table = self.get_table_variants(clause="from") 2152 header_columns = self.get_header_columns() 2153 2154 # Check all columns in the database 2155 query = f""" SELECT * FROM {table} LIMIT 1 """ 2156 log.debug(f"query {query}") 2157 table_columns = self.get_query_to_df(query).columns.tolist() 2158 extra_columns = [] 2159 2160 # Construct extra infos (not in header) 2161 for column in table_columns: 2162 if column not in header_columns: 2163 extra_columns.append(column) 2164 2165 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- `table`: the name of the table from which to retrieve the extra columns that are not present in the header. If this parameter is not provided, the function defaults to using the variants table.
Returns
A list of columns that are in the specified table but not in the header of the table.
2167 def get_extra_infos_sql(self, table: str = None) -> str: 2168 """ 2169 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2170 by double quotes 2171 2172 :param table: The name of the table to get the extra infos from. If None, the default table is 2173 used 2174 :type table: str 2175 :return: A string of the extra infos 2176 """ 2177 2178 return ", ".join( 2179 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2180 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
    def export_header(
        self,
        header_name: str = None,
        output_file: str = None,
        output_file_ext: str = ".hdr",
        clean_header: bool = True,
        remove_chrom_line: bool = False,
    ) -> str:
        """
        Extract the VCF header, adjust it, and write it to a header file.

        The header object is serialized, its #CHROM line replaced with the
        real columns found in the input file, optionally cleaned and/or
        stripped of the #CHROM line, then written to
        "<output_file><output_file_ext>".

        :param header_name: name of the header file to be created.
            NOTE(review): this value is only used to decide whether
            output_file falls back to self.get_output(); it is never used to
            build the output path — confirm intent
        :type header_name: str
        :param output_file: base name the header file is derived from (the
            extension is appended); defaults to self.get_output()
        :type output_file: str
        :param output_file_ext: extension appended to output_file to build
            the header file name, defaults to ".hdr"
        :type output_file_ext: str (optional)
        :param clean_header: if True, rewrite malformed
            "##FORMAT=...Type=Flag" lines as Type=String, defaults to True
        :type clean_header: bool (optional)
        :param remove_chrom_line: if True, drop the #CHROM line from the
            header before writing, defaults to False
        :type remove_chrom_line: bool (optional)
        :return: the name of the header file created
        """

        # Fall back to the object's output file when neither name is given.
        # NOTE(review): if header_name is given but output_file is not,
        # output_file stays None and "output_file + output_file_ext" below
        # raises TypeError — confirm callers always pass output_file then.
        if not header_name and not output_file:
            output_file = self.get_output()

        if self.get_header():

            # Header object of this VCF
            header_obj = self.get_header()

            # Database object over the input file, used to read its columns
            db_for_header = Database(database=self.get_input())

            # Real columns present in the input file
            db_header_columns = db_for_header.get_columns()

            with tempfile.TemporaryDirectory() as tmpdir:

                # Serialize the header to a temporary file
                # (vcf.Writer writes the header at construction time)
                header_file_tmp = os.path.join(tmpdir, "header")
                f = open(header_file_tmp, "w")
                vcf.Writer(f, header_obj)
                f.close()

                # Re-read the header lines and replace the last line (#CHROM)
                # with the real columns found in the input file
                header_list = db_for_header.read_header_file(
                    header_file=header_file_tmp
                )
                header_list[-1] = "\t".join(db_header_columns)

                # Optionally remove the #CHROM line entirely
                if remove_chrom_line:
                    header_list.pop()

                # Clean malformed header lines: a FORMAT field cannot be of
                # Type=Flag; rewrite (at most 2 occurrences per line) as
                # Type=String
                if clean_header:
                    header_list_clean = []
                    for head in header_list:
                        head_clean = head
                        head_clean = re.subn(
                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
                            head_clean,
                            2,
                        )[0]
                        header_list_clean.append(head_clean)
                    header_list = header_list_clean

            # Write the final header next to the output file
            tmp_header_name = output_file + output_file_ext

            f = open(tmp_header_name, "w")
            for line in header_list:
                f.write(line)
            f.close()

            return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- `header_name`: the name of the header file to be created. If this parameter is not specified, the header will be written to the output file.
- `output_file`: the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file.
- `output_file_ext`: the extension of the output header file, appended to the `output_file` name to build the final header file name. Defaults to ".hdr".
- `clean_header`: a boolean flag that determines whether the header should be cleaned by rewriting certain malformed lines. Defaults to True.
- `remove_chrom_line`: a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it is kept. Defaults to False.
Returns
The function
export_headerreturns the name of the temporary header file that is created.
2277 def export_variant_vcf( 2278 self, 2279 vcf_file, 2280 remove_info: bool = False, 2281 add_samples: bool = True, 2282 list_samples: list = [], 2283 where_clause: str = "", 2284 index: bool = False, 2285 threads: int | None = None, 2286 ) -> bool | None: 2287 """ 2288 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2289 remove INFO field, add samples, and control compression and indexing. 2290 2291 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2292 written to. It is the output file that will contain the filtered VCF data based on the specified 2293 parameters 2294 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2295 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2296 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2297 in, defaults to False 2298 :type remove_info: bool (optional) 2299 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2300 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2301 If set to False, the samples will be removed. The default value is True, defaults to True 2302 :type add_samples: bool (optional) 2303 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2304 in the output VCF file. By default, all samples will be included. If you provide a list of 2305 samples, only those samples will be included in the output file 2306 :type list_samples: list 2307 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2308 determines whether or not to create an index for the output VCF file. If `index` is set to 2309 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2310 :type index: bool (optional) 2311 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2312 number of threads to use for exporting the VCF file. It determines how many parallel threads 2313 will be used during the export process. More threads can potentially speed up the export process 2314 by utilizing multiple cores of the processor. If 2315 :type threads: int | None 2316 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2317 method with various parameters including the output file, query, threads, sort flag, and index 2318 flag. The `export_output` method is responsible for exporting the VCF data based on the 2319 specified parameters and configurations provided in the `export_variant_vcf` function. 2320 """ 2321 2322 # Config 2323 config = self.get_config() 2324 2325 # Extract VCF 2326 log.debug("Export VCF...") 2327 2328 # Table variants 2329 table_variants = self.get_table_variants() 2330 2331 # Threads 2332 if not threads: 2333 threads = self.get_threads() 2334 2335 # Info fields 2336 if remove_info: 2337 if not isinstance(remove_info, str): 2338 remove_info = "." 
2339 info_field = f"""'{remove_info}' as INFO""" 2340 else: 2341 info_field = "INFO" 2342 2343 # Samples fields 2344 if add_samples: 2345 if not list_samples: 2346 list_samples = self.get_header_sample_list() 2347 if list_samples: 2348 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2349 else: 2350 samples_fields = "" 2351 log.debug(f"samples_fields: {samples_fields}") 2352 else: 2353 samples_fields = "" 2354 2355 # Where clause 2356 if where_clause is None: 2357 where_clause = "" 2358 2359 # Variants 2360 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2361 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2362 log.debug(f"sql_query_select={sql_query_select}") 2363 2364 return self.export_output( 2365 output_file=vcf_file, 2366 output_header=None, 2367 export_header=True, 2368 query=sql_query_select, 2369 parquet_partitions=None, 2370 chunk_size=config.get("chunk_size", None), 2371 threads=threads, 2372 sort=True, 2373 index=index, 2374 order_by=None, 2375 )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: The
vcf_fileparameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters - remove_info: The
remove_infoparameter in theexport_variant_vcffunction is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set toTrue, the INFO field will be removed. If set toFalse, the INFO field will be included in, defaults to False - add_samples: The
add_samplesparameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True - list_samples: The
list_samplesparameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file - index: The
indexparameter in theexport_variant_vcffunction is a boolean flag that determines whether or not to create an index for the output VCF file. Ifindexis set toTrue, the output VCF file will be indexed using tabix. Ifindex, defaults to False - threads: The
threadsparameter in theexport_variant_vcffunction specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns
The
export_variant_vcffunction returns the result of calling theexport_outputmethod with various parameters including the output file, query, threads, sort flag, and index flag. Theexport_outputmethod is responsible for exporting the VCF data based on the specified parameters and configurations provided in theexport_variant_vcffunction.
2377 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2378 """ 2379 It takes a list of commands and runs them in parallel using the number of threads specified 2380 2381 :param commands: A list of commands to run 2382 :param threads: The number of threads to use, defaults to 1 (optional) 2383 """ 2384 2385 run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
2387 def get_threads(self, default: int = 1) -> int: 2388 """ 2389 This function returns the number of threads to use for a job, with a default value of 1 if not 2390 specified. 2391 2392 :param default: The `default` parameter in the `get_threads` method is used to specify the 2393 default number of threads to use if no specific value is provided. If no value is provided for 2394 the `threads` parameter in the configuration or input parameters, the `default` value will be 2395 used, defaults to 1 2396 :type default: int (optional) 2397 :return: the number of threads to use for the current job. 2398 """ 2399 2400 # Config 2401 config = self.get_config() 2402 2403 # Param 2404 param = self.get_param() 2405 2406 # Input threads 2407 input_thread = param.get("threads", config.get("threads", None)) 2408 2409 # Check threads 2410 if not input_thread: 2411 threads = default 2412 elif int(input_thread) <= 0: 2413 threads = os.cpu_count() 2414 else: 2415 threads = int(input_thread) 2416 return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: The
defaultparameter in theget_threadsmethod is used to specify the default number of threads to use if no specific value is provided. If no value is provided for thethreadsparameter in the configuration or input parameters, thedefaultvalue will be used, defaults to 1
Returns
the number of threads to use for the current job.
2418 def get_memory(self, default: str = None) -> str: 2419 """ 2420 This function retrieves the memory value from parameters or configuration with a default value 2421 if not found. 2422 2423 :param default: The `get_memory` function takes in a default value as a string parameter. This 2424 default value is used as a fallback in case the `memory` parameter is not provided in the 2425 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2426 the function 2427 :type default: str 2428 :return: The `get_memory` function returns a string value representing the memory parameter. If 2429 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2430 return the default value provided as an argument to the function. 2431 """ 2432 2433 # Config 2434 config = self.get_config() 2435 2436 # Param 2437 param = self.get_param() 2438 2439 # Input threads 2440 input_memory = param.get("memory", config.get("memory", None)) 2441 2442 # Check threads 2443 if input_memory: 2444 memory = input_memory 2445 else: 2446 memory = default 2447 2448 return memory
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: The
get_memoryfunction takes in a default value as a string parameter. This default value is used as a fallback in case thememoryparameter is not provided in theparamdictionary or theconfigdictionary. Ifmemoryis not found in either dictionary, the function
Returns
The
get_memoryfunction returns a string value representing the memory parameter. If theinput_memoryis provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.
2450 def update_from_vcf(self, vcf_file: str) -> None: 2451 """ 2452 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2453 2454 :param vcf_file: the path to the VCF file 2455 """ 2456 2457 connexion_format = self.get_connexion_format() 2458 2459 if connexion_format in ["duckdb"]: 2460 self.update_from_vcf_duckdb(vcf_file) 2461 elif connexion_format in ["sqlite"]: 2462 self.update_from_vcf_sqlite(vcf_file)
If the database is duckdb, then use the parquet method, otherwise use the sqlite method
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the INFO column of the
        variants table, using the DuckDB connexion.

        For each variant matched on #CHROM/POS/REF/ALT, the VCF INFO string
        is appended to the existing INFO, with ';' inserted when both sides
        are non-empty.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF data lines into a DataFrame: skip exactly the header
        # lines, then use the #CHROM line as the column header (header=0)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: vcf_df looks unused, but it is referenced by name inside the
        # SQL below — DuckDB resolves 'vcf_df' as a replacement scan over
        # this local DataFrame. Do not rename or remove it.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
2520 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2521 """ 2522 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2523 table, then updates the INFO column of the variants table with the INFO column of the temporary 2524 table 2525 2526 :param vcf_file: The path to the VCF file you want to update the database with 2527 """ 2528 2529 # Create a temporary table for the VCF 2530 table_vcf = "tmp_vcf" 2531 sql_create = ( 2532 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2533 ) 2534 self.conn.execute(sql_create) 2535 2536 # Loading VCF into temporaire table 2537 vcf_df = pd.read_csv( 2538 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2539 ) 2540 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2541 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2542 2543 # Update table 'variants' with VCF data 2544 # warning: CONCAT as || operator 2545 sql_query_update = f""" 2546 UPDATE variants as table_variants 2547 SET INFO = CASE 2548 WHEN INFO NOT IN ('', '.') 2549 THEN INFO 2550 ELSE '' 2551 END || 2552 ( 2553 SELECT 2554 CASE 2555 WHEN table_variants.INFO NOT IN ('','.') 2556 AND table_vcf.INFO NOT IN ('','.') 2557 THEN ';' 2558 ELSE '' 2559 END || 2560 CASE 2561 WHEN table_vcf.INFO NOT IN ('','.') 2562 THEN table_vcf.INFO 2563 ELSE '' 2564 END 2565 FROM {table_vcf} as table_vcf 2566 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2567 AND table_vcf.\"POS\" = table_variants.\"POS\" 2568 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2569 AND table_vcf.\"REF\" = table_variants.\"REF\" 2570 ) 2571 """ 2572 self.conn.execute(sql_query_update) 2573 2574 # Drop temporary table 2575 sql_drop = f"DROP TABLE {table_vcf}" 2576 self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2578 def drop_variants_table(self) -> None: 2579 """ 2580 > This function drops the variants table 2581 """ 2582 2583 table_variants = self.get_table_variants() 2584 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2585 self.conn.execute(sql_table_variants)
This function drops the variants table
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it with
        a hash derived from the assembly and the #CHROM/POS/REF/ALT columns.

        :param variant_id_column: name of the column to create in the
            variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: if True, the column is (re)computed even if it already
            exists
        :type force: bool
        :return: the name of the column that contains the variant_id
        """

        # Assembly: parameters take precedence over configuration
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix used by explode_infos
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (temporarily)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name when none is given
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create the variant_id column if missing (or when forced).
        # NOTE(review): the existence check uses the literal "variant_id",
        # not variant_id_column — a custom column name is re-created on
        # every call; confirm intent.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column with a zero default
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Populate the column with a hash of assembly + coordinates.
            # NOTE(review): the last hash argument '"{prefix}SVTYPE"' is a
            # single-quoted SQL string literal, so the hash input is the
            # constant column NAME, not the SVTYPE value — confirm this is
            # intended before changing it, as fixing it would change all
            # existing variant ids.
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the temporary exploded columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Return the variant_id column name
        return variant_id_column
It adds a column to the variants table called variant_id and populates it with a hash of the
#CHROM, POS, REF, and ALT columns
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2646 def get_variant_id_column( 2647 self, variant_id_column: str = "variant_id", force: bool = None 2648 ) -> str: 2649 """ 2650 This function returns the variant_id column name 2651 2652 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2653 defaults to variant_id 2654 :type variant_id_column: str (optional) 2655 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2656 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2657 if it is not already set, or if it is set 2658 :type force: bool 2659 :return: The variant_id column name. 2660 """ 2661 2662 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
2668 def scan_databases( 2669 self, 2670 database_formats: list = ["parquet"], 2671 database_releases: list = ["current"], 2672 ) -> dict: 2673 """ 2674 The function `scan_databases` scans for available databases based on specified formats and 2675 releases. 2676 2677 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2678 of the databases to be scanned. In this case, the accepted format is "parquet" 2679 :type database_formats: list ["parquet"] 2680 :param database_releases: The `database_releases` parameter is a list that specifies the 2681 releases of the databases to be scanned. In the provided function, the default value for 2682 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2683 databases that are in the "current" 2684 :type database_releases: list 2685 :return: The function `scan_databases` returns a dictionary containing information about 2686 databases that match the specified formats and releases. 2687 """ 2688 2689 # Config 2690 config = self.get_config() 2691 2692 # Param 2693 param = self.get_param() 2694 2695 # Param - Assembly 2696 assembly = param.get("assembly", config.get("assembly", None)) 2697 if not assembly: 2698 assembly = DEFAULT_ASSEMBLY 2699 log.warning(f"Default assembly '{assembly}'") 2700 2701 # Scan for availabled databases 2702 log.info( 2703 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2704 ) 2705 databases_infos_dict = databases_infos( 2706 database_folder_releases=database_releases, 2707 database_formats=database_formats, 2708 assembly=assembly, 2709 config=config, 2710 ) 2711 log.info( 2712 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2713 ) 2714 2715 return databases_infos_dict
The function `scan_databases` scans for available databases based on specified formats and
releases.

Parameters
- `database_formats`: a list specifying the formats of the databases to be scanned; in this case, the accepted format is "parquet".
- `database_releases`: a list specifying the releases of the databases to be scanned; the default value is `["current"]`, meaning that by default the function scans databases in the "current" release.

Returns
A dictionary containing information about the databases that match the specified formats and releases.
2717 def annotation(self) -> None: 2718 """ 2719 It annotates the VCF file with the annotations specified in the config file. 2720 """ 2721 2722 # Config 2723 config = self.get_config() 2724 2725 # Param 2726 param = self.get_param() 2727 2728 # Param - Assembly 2729 assembly = param.get("assembly", config.get("assembly", None)) 2730 if not assembly: 2731 assembly = DEFAULT_ASSEMBLY 2732 log.warning(f"Default assembly '{assembly}'") 2733 2734 # annotations databases folders 2735 annotations_databases = set( 2736 config.get("folders", {}) 2737 .get("databases", {}) 2738 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2739 + config.get("folders", {}) 2740 .get("databases", {}) 2741 .get("parquet", ["~/howard/databases/parquet/current"]) 2742 + config.get("folders", {}) 2743 .get("databases", {}) 2744 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2745 ) 2746 2747 # Get param annotations 2748 if param.get("annotations", None) and isinstance( 2749 param.get("annotations", None), str 2750 ): 2751 log.debug(param.get("annotations", None)) 2752 param_annotation_list = param.get("annotations").split(",") 2753 else: 2754 param_annotation_list = [] 2755 2756 # Each tools param 2757 if param.get("annotation_parquet", None) != None: 2758 log.debug( 2759 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2760 ) 2761 if isinstance(param.get("annotation_parquet", None), list): 2762 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2763 else: 2764 param_annotation_list.append(param.get("annotation_parquet")) 2765 if param.get("annotation_snpsift", None) != None: 2766 if isinstance(param.get("annotation_snpsift", None), list): 2767 param_annotation_list.append( 2768 "snpsift:" 2769 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2770 ) 2771 else: 2772 param_annotation_list.append( 2773 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2774 ) 2775 if param.get("annotation_snpeff", None) 
!= None: 2776 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2777 if param.get("annotation_bcftools", None) != None: 2778 if isinstance(param.get("annotation_bcftools", None), list): 2779 param_annotation_list.append( 2780 "bcftools:" 2781 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2782 ) 2783 else: 2784 param_annotation_list.append( 2785 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2786 ) 2787 if param.get("annotation_annovar", None) != None: 2788 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2789 if param.get("annotation_exomiser", None) != None: 2790 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2791 if param.get("annotation_splice", None) != None: 2792 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2793 2794 # Merge param annotations list 2795 param["annotations"] = ",".join(param_annotation_list) 2796 2797 # debug 2798 log.debug(f"param_annotations={param['annotations']}") 2799 2800 if param.get("annotations"): 2801 2802 # Log 2803 # log.info("Annotations - Check annotation parameters") 2804 2805 if not "annotation" in param: 2806 param["annotation"] = {} 2807 2808 # List of annotations parameters 2809 annotations_list_input = {} 2810 if isinstance(param.get("annotations", None), str): 2811 annotation_file_list = [ 2812 value for value in param.get("annotations", "").split(",") 2813 ] 2814 for annotation_file in annotation_file_list: 2815 annotations_list_input[annotation_file] = {"INFO": None} 2816 else: 2817 annotations_list_input = param.get("annotations", {}) 2818 2819 log.info(f"Quick Annotations:") 2820 for annotation_key in list(annotations_list_input.keys()): 2821 log.info(f" {annotation_key}") 2822 2823 # List of annotations and associated fields 2824 annotations_list = {} 2825 2826 for annotation_file in annotations_list_input: 2827 2828 # Explode annotations if ALL 2829 if ( 2830 
annotation_file.upper() == "ALL" 2831 or annotation_file.upper().startswith("ALL:") 2832 ): 2833 2834 # check ALL parameters (formats, releases) 2835 annotation_file_split = annotation_file.split(":") 2836 database_formats = "parquet" 2837 database_releases = "current" 2838 for annotation_file_option in annotation_file_split[1:]: 2839 database_all_options_split = annotation_file_option.split("=") 2840 if database_all_options_split[0] == "format": 2841 database_formats = database_all_options_split[1].split("+") 2842 if database_all_options_split[0] == "release": 2843 database_releases = database_all_options_split[1].split("+") 2844 2845 # Scan for availabled databases 2846 databases_infos_dict = self.scan_databases( 2847 database_formats=database_formats, 2848 database_releases=database_releases, 2849 ) 2850 2851 # Add found databases in annotation parameters 2852 for database_infos in databases_infos_dict.keys(): 2853 annotations_list[database_infos] = {"INFO": None} 2854 2855 else: 2856 annotations_list[annotation_file] = annotations_list_input[ 2857 annotation_file 2858 ] 2859 2860 # Check each databases 2861 if len(annotations_list): 2862 2863 log.info( 2864 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
2865 ) 2866 2867 for annotation_file in annotations_list: 2868 2869 # Init 2870 annotations = annotations_list.get(annotation_file, None) 2871 2872 # Annotation snpEff 2873 if annotation_file.startswith("snpeff"): 2874 2875 log.debug(f"Quick Annotation snpEff") 2876 2877 if "snpeff" not in param["annotation"]: 2878 param["annotation"]["snpeff"] = {} 2879 2880 if "options" not in param["annotation"]["snpeff"]: 2881 param["annotation"]["snpeff"]["options"] = "" 2882 2883 # snpEff options in annotations 2884 param["annotation"]["snpeff"]["options"] = "".join( 2885 annotation_file.split(":")[1:] 2886 ) 2887 2888 # Annotation Annovar 2889 elif annotation_file.startswith("annovar"): 2890 2891 log.debug(f"Quick Annotation Annovar") 2892 2893 if "annovar" not in param["annotation"]: 2894 param["annotation"]["annovar"] = {} 2895 2896 if "annotations" not in param["annotation"]["annovar"]: 2897 param["annotation"]["annovar"]["annotations"] = {} 2898 2899 # Options 2900 annotation_file_split = annotation_file.split(":") 2901 for annotation_file_annotation in annotation_file_split[1:]: 2902 if annotation_file_annotation: 2903 param["annotation"]["annovar"]["annotations"][ 2904 annotation_file_annotation 2905 ] = annotations 2906 2907 # Annotation Exomiser 2908 elif annotation_file.startswith("exomiser"): 2909 2910 log.debug(f"Quick Annotation Exomiser") 2911 2912 param["annotation"]["exomiser"] = params_string_to_dict( 2913 annotation_file 2914 ) 2915 2916 # Annotation Splice 2917 elif annotation_file.startswith("splice"): 2918 2919 log.debug(f"Quick Annotation Splice") 2920 2921 param["annotation"]["splice"] = params_string_to_dict( 2922 annotation_file 2923 ) 2924 2925 # Annotation Parquet or BCFTOOLS 2926 else: 2927 2928 # Tools detection 2929 if annotation_file.startswith("bcftools:"): 2930 annotation_tool_initial = "bcftools" 2931 annotation_file = ":".join(annotation_file.split(":")[1:]) 2932 elif annotation_file.startswith("snpsift:"): 2933 annotation_tool_initial = 
"snpsift" 2934 annotation_file = ":".join(annotation_file.split(":")[1:]) 2935 else: 2936 annotation_tool_initial = None 2937 2938 # list of files 2939 annotation_file_list = annotation_file.replace("+", ":").split( 2940 ":" 2941 ) 2942 2943 for annotation_file in annotation_file_list: 2944 2945 if annotation_file: 2946 2947 # Annotation tool initial 2948 annotation_tool = annotation_tool_initial 2949 2950 # Find file 2951 annotation_file_found = None 2952 2953 # Expand user 2954 annotation_file = full_path(annotation_file) 2955 2956 if os.path.exists(annotation_file): 2957 annotation_file_found = annotation_file 2958 2959 else: 2960 # Find within assembly folders 2961 for annotations_database in annotations_databases: 2962 found_files = find_all( 2963 annotation_file, 2964 os.path.join( 2965 annotations_database, assembly 2966 ), 2967 ) 2968 if len(found_files) > 0: 2969 annotation_file_found = found_files[0] 2970 break 2971 if not annotation_file_found and not assembly: 2972 # Find within folders 2973 for ( 2974 annotations_database 2975 ) in annotations_databases: 2976 found_files = find_all( 2977 annotation_file, annotations_database 2978 ) 2979 if len(found_files) > 0: 2980 annotation_file_found = found_files[0] 2981 break 2982 log.debug( 2983 f"for {annotation_file} annotation_file_found={annotation_file_found}" 2984 ) 2985 2986 # Full path 2987 annotation_file_found = full_path(annotation_file_found) 2988 2989 if annotation_file_found: 2990 2991 database = Database(database=annotation_file_found) 2992 quick_annotation_format = database.get_format() 2993 quick_annotation_is_compressed = ( 2994 database.is_compressed() 2995 ) 2996 quick_annotation_is_indexed = os.path.exists( 2997 f"{annotation_file_found}.tbi" 2998 ) 2999 bcftools_preference = False 3000 3001 # Check Annotation Tool 3002 if not annotation_tool: 3003 if ( 3004 bcftools_preference 3005 and quick_annotation_format 3006 in ["vcf", "bed"] 3007 and quick_annotation_is_compressed 3008 and 
quick_annotation_is_indexed 3009 ): 3010 annotation_tool = "bcftools" 3011 elif quick_annotation_format in [ 3012 "vcf", 3013 "bed", 3014 "tsv", 3015 "tsv", 3016 "csv", 3017 "json", 3018 "tbl", 3019 "parquet", 3020 "duckdb", 3021 ]: 3022 annotation_tool = "parquet" 3023 else: 3024 log.error( 3025 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3026 ) 3027 raise ValueError( 3028 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3029 ) 3030 3031 log.debug( 3032 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3033 ) 3034 3035 # Annotation Tool dispatch 3036 if annotation_tool: 3037 if annotation_tool not in param["annotation"]: 3038 param["annotation"][annotation_tool] = {} 3039 if ( 3040 "annotations" 3041 not in param["annotation"][annotation_tool] 3042 ): 3043 param["annotation"][annotation_tool][ 3044 "annotations" 3045 ] = {} 3046 param["annotation"][annotation_tool][ 3047 "annotations" 3048 ][annotation_file_found] = annotations 3049 3050 else: 3051 log.error( 3052 f"Quick Annotation File {annotation_file} does NOT exist" 3053 ) 3054 3055 self.set_param(param) 3056 3057 if param.get("annotation", None): 3058 log.info("Annotations") 3059 if param.get("annotation", {}).get("parquet", None): 3060 log.info("Annotations 'parquet'...") 3061 self.annotation_parquet() 3062 if param.get("annotation", {}).get("bcftools", None): 3063 log.info("Annotations 'bcftools'...") 3064 self.annotation_bcftools() 3065 if param.get("annotation", {}).get("snpsift", None): 3066 log.info("Annotations 'snpsift'...") 3067 self.annotation_snpsift() 3068 if param.get("annotation", {}).get("annovar", None): 3069 log.info("Annotations 'annovar'...") 3070 self.annotation_annovar() 3071 if param.get("annotation", {}).get("snpeff", None): 3072 log.info("Annotations 'snpeff'...") 3073 self.annotation_snpeff() 3074 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3075 log.info("Annotations 'exomiser'...") 3076 self.annotation_exomiser() 3077 if param.get("annotation", {}).get("splice", None) is not None: 3078 log.info("Annotations 'splice' ...") 3079 self.annotation_splice() 3080 3081 # Explode INFOS fields into table fields 3082 if self.get_explode_infos(): 3083 self.explode_infos( 3084 prefix=self.get_explode_infos_prefix(), 3085 fields=self.get_explode_infos_fields(), 3086 force=True, 3087 )
It annotates the VCF file with the annotations specified in the config file.
3089 def annotation_snpsift(self, threads: int = None) -> None: 3090 """ 3091 This function annotate with bcftools 3092 3093 :param threads: Number of threads to use 3094 :return: the value of the variable "return_value". 3095 """ 3096 3097 # DEBUG 3098 log.debug("Start annotation with bcftools databases") 3099 3100 # Threads 3101 if not threads: 3102 threads = self.get_threads() 3103 log.debug("Threads: " + str(threads)) 3104 3105 # Config 3106 config = self.get_config() 3107 log.debug("Config: " + str(config)) 3108 3109 # Config - snpSift 3110 snpsift_bin_command = get_bin_command( 3111 bin="SnpSift.jar", 3112 tool="snpsift", 3113 bin_type="jar", 3114 config=config, 3115 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3116 ) 3117 if not snpsift_bin_command: 3118 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3119 log.error(msg_err) 3120 raise ValueError(msg_err) 3121 3122 # Config - bcftools 3123 bcftools_bin_command = get_bin_command( 3124 bin="bcftools", 3125 tool="bcftools", 3126 bin_type="bin", 3127 config=config, 3128 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3129 ) 3130 if not bcftools_bin_command: 3131 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3132 log.error(msg_err) 3133 raise ValueError(msg_err) 3134 3135 # Config - BCFTools databases folders 3136 databases_folders = set( 3137 self.get_config() 3138 .get("folders", {}) 3139 .get("databases", {}) 3140 .get("annotations", ["."]) 3141 + self.get_config() 3142 .get("folders", {}) 3143 .get("databases", {}) 3144 .get("bcftools", ["."]) 3145 ) 3146 log.debug("Databases annotations: " + str(databases_folders)) 3147 3148 # Param 3149 annotations = ( 3150 self.get_param() 3151 .get("annotation", {}) 3152 .get("snpsift", {}) 3153 .get("annotations", None) 3154 ) 3155 log.debug("Annotations: " + str(annotations)) 3156 3157 # Assembly 3158 assembly = self.get_param().get( 3159 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3160 ) 3161 
3162 # Data 3163 table_variants = self.get_table_variants() 3164 3165 # Check if not empty 3166 log.debug("Check if not empty") 3167 sql_query_chromosomes = ( 3168 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3169 ) 3170 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3171 if not sql_query_chromosomes_df["count"][0]: 3172 log.info(f"VCF empty") 3173 return 3174 3175 # VCF header 3176 vcf_reader = self.get_header() 3177 log.debug("Initial header: " + str(vcf_reader.infos)) 3178 3179 # Existing annotations 3180 for vcf_annotation in self.get_header().infos: 3181 3182 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3183 log.debug( 3184 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3185 ) 3186 3187 if annotations: 3188 3189 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3190 3191 # Export VCF file 3192 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3193 3194 # Init 3195 commands = {} 3196 3197 for annotation in annotations: 3198 annotation_fields = annotations[annotation] 3199 3200 # Annotation Name 3201 annotation_name = os.path.basename(annotation) 3202 3203 if not annotation_fields: 3204 annotation_fields = {"INFO": None} 3205 3206 log.debug(f"Annotation '{annotation_name}'") 3207 log.debug( 3208 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3209 ) 3210 3211 # Create Database 3212 database = Database( 3213 database=annotation, 3214 databases_folders=databases_folders, 3215 assembly=assembly, 3216 ) 3217 3218 # Find files 3219 db_file = database.get_database() 3220 db_file = full_path(db_file) 3221 db_hdr_file = database.get_header_file() 3222 db_hdr_file = full_path(db_hdr_file) 3223 db_file_type = database.get_format() 3224 db_tbi_file = f"{db_file}.tbi" 3225 db_file_compressed = database.is_compressed() 3226 3227 # Check if compressed 3228 if not db_file_compressed: 3229 log.error( 3230 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3231 ) 3232 raise ValueError( 3233 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3234 ) 3235 3236 # Check if indexed 3237 if not os.path.exists(db_tbi_file): 3238 log.error( 3239 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3240 ) 3241 raise ValueError( 3242 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3243 ) 3244 3245 # Check index - try to create if not exists 3246 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3247 log.error("Annotation failed: database not valid") 3248 log.error(f"Annotation annotation file: {db_file}") 3249 log.error(f"Annotation annotation header: {db_hdr_file}") 3250 log.error(f"Annotation annotation index: {db_tbi_file}") 3251 raise ValueError( 3252 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3253 ) 3254 else: 3255 3256 log.debug( 3257 f"Annotation '{annotation}' - file: " 3258 + str(db_file) 3259 + " and " 3260 + str(db_hdr_file) 3261 ) 3262 3263 # Load header as VCF object 3264 db_hdr_vcf = Variants(input=db_hdr_file) 3265 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3266 log.debug( 3267 "Annotation database header: " 3268 + str(db_hdr_vcf_header_infos) 3269 ) 3270 3271 # For all fields in database 3272 annotation_fields_full = False 3273 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3274 annotation_fields = { 3275 key: key for key in db_hdr_vcf_header_infos 3276 } 3277 log.debug( 3278 "Annotation database header - All annotations added: " 3279 + str(annotation_fields) 3280 ) 3281 annotation_fields_full = True 3282 3283 # # Create file for field rename 3284 # log.debug("Create file for field rename") 3285 # tmp_rename = NamedTemporaryFile( 3286 # prefix=self.get_prefix(), 3287 # dir=self.get_tmp_dir(), 3288 # suffix=".rename", 3289 # delete=False, 3290 # ) 3291 # tmp_rename_name = tmp_rename.name 
3292 # tmp_files.append(tmp_rename_name) 3293 3294 # Number of fields 3295 nb_annotation_field = 0 3296 annotation_list = [] 3297 annotation_infos_rename_list = [] 3298 3299 for annotation_field in annotation_fields: 3300 3301 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3302 annotation_fields_new_name = annotation_fields.get( 3303 annotation_field, annotation_field 3304 ) 3305 if not annotation_fields_new_name: 3306 annotation_fields_new_name = annotation_field 3307 3308 # Check if field is in DB and if field is not elready in input data 3309 if ( 3310 annotation_field in db_hdr_vcf.get_header().infos 3311 and annotation_fields_new_name 3312 not in self.get_header().infos 3313 ): 3314 3315 log.info( 3316 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3317 ) 3318 3319 # BCFTools annotate param to rename fields 3320 if annotation_field != annotation_fields_new_name: 3321 annotation_infos_rename_list.append( 3322 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3323 ) 3324 3325 # Add INFO field to header 3326 db_hdr_vcf_header_infos_number = ( 3327 db_hdr_vcf_header_infos[annotation_field].num or "." 
3328 ) 3329 db_hdr_vcf_header_infos_type = ( 3330 db_hdr_vcf_header_infos[annotation_field].type 3331 or "String" 3332 ) 3333 db_hdr_vcf_header_infos_description = ( 3334 db_hdr_vcf_header_infos[annotation_field].desc 3335 or f"{annotation_field} description" 3336 ) 3337 db_hdr_vcf_header_infos_source = ( 3338 db_hdr_vcf_header_infos[annotation_field].source 3339 or "unknown" 3340 ) 3341 db_hdr_vcf_header_infos_version = ( 3342 db_hdr_vcf_header_infos[annotation_field].version 3343 or "unknown" 3344 ) 3345 3346 vcf_reader.infos[annotation_fields_new_name] = ( 3347 vcf.parser._Info( 3348 annotation_fields_new_name, 3349 db_hdr_vcf_header_infos_number, 3350 db_hdr_vcf_header_infos_type, 3351 db_hdr_vcf_header_infos_description, 3352 db_hdr_vcf_header_infos_source, 3353 db_hdr_vcf_header_infos_version, 3354 self.code_type_map[ 3355 db_hdr_vcf_header_infos_type 3356 ], 3357 ) 3358 ) 3359 3360 annotation_list.append(annotation_field) 3361 3362 nb_annotation_field += 1 3363 3364 else: 3365 3366 if ( 3367 annotation_field 3368 not in db_hdr_vcf.get_header().infos 3369 ): 3370 log.warning( 3371 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3372 ) 3373 if ( 3374 annotation_fields_new_name 3375 in self.get_header().infos 3376 ): 3377 log.warning( 3378 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3379 ) 3380 3381 log.info( 3382 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3383 ) 3384 3385 annotation_infos = ",".join(annotation_list) 3386 3387 if annotation_infos != "": 3388 3389 # Annotated VCF (and error file) 3390 tmp_annotation_vcf_name = os.path.join( 3391 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3392 ) 3393 tmp_annotation_vcf_name_err = ( 3394 tmp_annotation_vcf_name + ".err" 3395 ) 3396 3397 # Add fields to annotate 3398 if not annotation_fields_full: 3399 annotation_infos_option = f"-info {annotation_infos}" 3400 else: 
3401 annotation_infos_option = "" 3402 3403 # Info fields rename 3404 if annotation_infos_rename_list: 3405 annotation_infos_rename = " -c " + ",".join( 3406 annotation_infos_rename_list 3407 ) 3408 else: 3409 annotation_infos_rename = "" 3410 3411 # Annotate command 3412 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3413 3414 # Add command 3415 commands[command_annotate] = tmp_annotation_vcf_name 3416 3417 if commands: 3418 3419 # Export VCF file 3420 self.export_variant_vcf( 3421 vcf_file=tmp_vcf_name, 3422 remove_info=True, 3423 add_samples=False, 3424 index=True, 3425 ) 3426 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3427 3428 # Num command 3429 nb_command = 0 3430 3431 # Annotate 3432 for command_annotate in commands: 3433 nb_command += 1 3434 log.info( 3435 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3436 ) 3437 log.debug(f"command_annotate={command_annotate}") 3438 run_parallel_commands([command_annotate], threads) 3439 3440 # Debug 3441 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3442 3443 # Update variants 3444 log.info( 3445 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3446 ) 3447 self.update_from_vcf(commands[command_annotate])
This function annotates the VCF with SnpSift databases (each SnpSift command is piped through bcftools).

Parameters
- threads: number of threads to use.

Returns
None; the function returns early if the variants table is empty.
3449 def annotation_bcftools(self, threads: int = None) -> None: 3450 """ 3451 This function annotate with bcftools 3452 3453 :param threads: Number of threads to use 3454 :return: the value of the variable "return_value". 3455 """ 3456 3457 # DEBUG 3458 log.debug("Start annotation with bcftools databases") 3459 3460 # Threads 3461 if not threads: 3462 threads = self.get_threads() 3463 log.debug("Threads: " + str(threads)) 3464 3465 # Config 3466 config = self.get_config() 3467 log.debug("Config: " + str(config)) 3468 3469 # DEBUG 3470 delete_tmp = True 3471 if self.get_config().get("verbosity", "warning") in ["debug"]: 3472 delete_tmp = False 3473 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3474 3475 # Config - BCFTools bin command 3476 bcftools_bin_command = get_bin_command( 3477 bin="bcftools", 3478 tool="bcftools", 3479 bin_type="bin", 3480 config=config, 3481 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3482 ) 3483 if not bcftools_bin_command: 3484 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3485 log.error(msg_err) 3486 raise ValueError(msg_err) 3487 3488 # Config - BCFTools databases folders 3489 databases_folders = set( 3490 self.get_config() 3491 .get("folders", {}) 3492 .get("databases", {}) 3493 .get("annotations", ["."]) 3494 + self.get_config() 3495 .get("folders", {}) 3496 .get("databases", {}) 3497 .get("bcftools", ["."]) 3498 ) 3499 log.debug("Databases annotations: " + str(databases_folders)) 3500 3501 # Param 3502 annotations = ( 3503 self.get_param() 3504 .get("annotation", {}) 3505 .get("bcftools", {}) 3506 .get("annotations", None) 3507 ) 3508 log.debug("Annotations: " + str(annotations)) 3509 3510 # Assembly 3511 assembly = self.get_param().get( 3512 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3513 ) 3514 3515 # Data 3516 table_variants = self.get_table_variants() 3517 3518 # Check if not empty 3519 log.debug("Check if not empty") 3520 sql_query_chromosomes = ( 3521 f"""SELECT 
count(*) as count FROM {table_variants} as table_variants""" 3522 ) 3523 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3524 if not sql_query_chromosomes_df["count"][0]: 3525 log.info(f"VCF empty") 3526 return 3527 3528 # Export in VCF 3529 log.debug("Create initial file to annotate") 3530 tmp_vcf = NamedTemporaryFile( 3531 prefix=self.get_prefix(), 3532 dir=self.get_tmp_dir(), 3533 suffix=".vcf.gz", 3534 delete=False, 3535 ) 3536 tmp_vcf_name = tmp_vcf.name 3537 3538 # VCF header 3539 vcf_reader = self.get_header() 3540 log.debug("Initial header: " + str(vcf_reader.infos)) 3541 3542 # Existing annotations 3543 for vcf_annotation in self.get_header().infos: 3544 3545 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3546 log.debug( 3547 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3548 ) 3549 3550 if annotations: 3551 3552 tmp_ann_vcf_list = [] 3553 commands = [] 3554 tmp_files = [] 3555 err_files = [] 3556 3557 for annotation in annotations: 3558 annotation_fields = annotations[annotation] 3559 3560 # Annotation Name 3561 annotation_name = os.path.basename(annotation) 3562 3563 if not annotation_fields: 3564 annotation_fields = {"INFO": None} 3565 3566 log.debug(f"Annotation '{annotation_name}'") 3567 log.debug( 3568 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3569 ) 3570 3571 # Create Database 3572 database = Database( 3573 database=annotation, 3574 databases_folders=databases_folders, 3575 assembly=assembly, 3576 ) 3577 3578 # Find files 3579 db_file = database.get_database() 3580 db_file = full_path(db_file) 3581 db_hdr_file = database.get_header_file() 3582 db_hdr_file = full_path(db_hdr_file) 3583 db_file_type = database.get_format() 3584 db_tbi_file = f"{db_file}.tbi" 3585 db_file_compressed = database.is_compressed() 3586 3587 # Check if compressed 3588 if not db_file_compressed: 3589 log.error( 3590 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3591 ) 
3592 raise ValueError( 3593 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3594 ) 3595 3596 # Check if indexed 3597 if not os.path.exists(db_tbi_file): 3598 log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file") 3599 raise ValueError( 3600 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3601 ) 3602 3603 # Check index - try to create if not exists 3604 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3605 log.error("Annotation failed: database not valid") 3606 log.error(f"Annotation annotation file: {db_file}") 3607 log.error(f"Annotation annotation header: {db_hdr_file}") 3608 log.error(f"Annotation annotation index: {db_tbi_file}") 3609 raise ValueError( 3610 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3611 ) 3612 else: 3613 3614 log.debug( 3615 f"Annotation '{annotation}' - file: " 3616 + str(db_file) 3617 + " and " 3618 + str(db_hdr_file) 3619 ) 3620 3621 # Load header as VCF object 3622 db_hdr_vcf = Variants(input=db_hdr_file) 3623 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3624 log.debug( 3625 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3626 ) 3627 3628 # For all fields in database 3629 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3630 annotation_fields = { 3631 key: key for key in db_hdr_vcf_header_infos 3632 } 3633 log.debug( 3634 "Annotation database header - All annotations added: " 3635 + str(annotation_fields) 3636 ) 3637 3638 # Number of fields 3639 nb_annotation_field = 0 3640 annotation_list = [] 3641 3642 for annotation_field in annotation_fields: 3643 3644 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3645 annotation_fields_new_name = annotation_fields.get( 3646 annotation_field, annotation_field 3647 ) 3648 if not annotation_fields_new_name: 3649 annotation_fields_new_name = annotation_field 3650 3651 # Check if field is in DB and if field is not elready in input data 3652 if ( 3653 annotation_field in db_hdr_vcf.get_header().infos 3654 and annotation_fields_new_name 3655 not in self.get_header().infos 3656 ): 3657 3658 log.info( 3659 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3660 ) 3661 3662 # Add INFO field to header 3663 db_hdr_vcf_header_infos_number = ( 3664 db_hdr_vcf_header_infos[annotation_field].num or "." 3665 ) 3666 db_hdr_vcf_header_infos_type = ( 3667 db_hdr_vcf_header_infos[annotation_field].type 3668 or "String" 3669 ) 3670 db_hdr_vcf_header_infos_description = ( 3671 db_hdr_vcf_header_infos[annotation_field].desc 3672 or f"{annotation_field} description" 3673 ) 3674 db_hdr_vcf_header_infos_source = ( 3675 db_hdr_vcf_header_infos[annotation_field].source 3676 or "unknown" 3677 ) 3678 db_hdr_vcf_header_infos_version = ( 3679 db_hdr_vcf_header_infos[annotation_field].version 3680 or "unknown" 3681 ) 3682 3683 vcf_reader.infos[annotation_fields_new_name] = ( 3684 vcf.parser._Info( 3685 annotation_fields_new_name, 3686 db_hdr_vcf_header_infos_number, 3687 db_hdr_vcf_header_infos_type, 3688 db_hdr_vcf_header_infos_description, 3689 db_hdr_vcf_header_infos_source, 3690 db_hdr_vcf_header_infos_version, 3691 self.code_type_map[db_hdr_vcf_header_infos_type], 3692 ) 3693 ) 3694 3695 # annotation_list.append(annotation_field) 3696 if annotation_field != annotation_fields_new_name: 3697 annotation_list.append( 3698 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3699 ) 3700 else: 3701 annotation_list.append(annotation_field) 3702 3703 nb_annotation_field += 1 3704 3705 else: 3706 3707 if annotation_field not in db_hdr_vcf.get_header().infos: 3708 log.warning( 3709 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3710 ) 3711 if annotation_fields_new_name in self.get_header().infos: 3712 log.warning( 3713 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3714 ) 3715 3716 log.info( 3717 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3718 ) 3719 3720 annotation_infos = ",".join(annotation_list) 3721 3722 if annotation_infos != "": 3723 3724 # Protect header for bcftools (remove "#CHROM" and variants line) 3725 log.debug("Protect Header file - remove #CHROM line if exists") 3726 tmp_header_vcf = NamedTemporaryFile( 3727 prefix=self.get_prefix(), 3728 dir=self.get_tmp_dir(), 3729 suffix=".hdr", 3730 delete=False, 3731 ) 3732 tmp_header_vcf_name = tmp_header_vcf.name 3733 tmp_files.append(tmp_header_vcf_name) 3734 # Command 3735 if db_hdr_file.endswith(".gz"): 3736 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3737 else: 3738 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3739 # Run 3740 run_parallel_commands([command_extract_header], 1) 3741 3742 # Find chomosomes 3743 log.debug("Find chromosomes ") 3744 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3745 sql_query_chromosomes_df = self.get_query_to_df( 3746 sql_query_chromosomes 3747 ) 3748 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3749 3750 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3751 3752 # BED columns in the annotation file 3753 if db_file_type in ["bed"]: 3754 annotation_infos = "CHROM,POS,POS," + annotation_infos 3755 3756 for chrom in chomosomes_list: 3757 3758 # Create BED on initial VCF 3759 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3760 tmp_bed = NamedTemporaryFile( 3761 prefix=self.get_prefix(), 3762 
dir=self.get_tmp_dir(), 3763 suffix=".bed", 3764 delete=False, 3765 ) 3766 tmp_bed_name = tmp_bed.name 3767 tmp_files.append(tmp_bed_name) 3768 3769 # Detecte regions 3770 log.debug( 3771 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3772 ) 3773 window = 1000000 3774 sql_query_intervals_for_bed = f""" 3775 SELECT \"#CHROM\", 3776 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3777 \"POS\"+{window} 3778 FROM {table_variants} as table_variants 3779 WHERE table_variants.\"#CHROM\" = '{chrom}' 3780 """ 3781 regions = self.conn.execute( 3782 sql_query_intervals_for_bed 3783 ).fetchall() 3784 merged_regions = merge_regions(regions) 3785 log.debug( 3786 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3787 ) 3788 3789 header = ["#CHROM", "START", "END"] 3790 with open(tmp_bed_name, "w") as f: 3791 # Write the header with tab delimiter 3792 f.write("\t".join(header) + "\n") 3793 for d in merged_regions: 3794 # Write each data row with tab delimiter 3795 f.write("\t".join(map(str, d)) + "\n") 3796 3797 # Tmp files 3798 tmp_annotation_vcf = NamedTemporaryFile( 3799 prefix=self.get_prefix(), 3800 dir=self.get_tmp_dir(), 3801 suffix=".vcf.gz", 3802 delete=False, 3803 ) 3804 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3805 tmp_files.append(tmp_annotation_vcf_name) 3806 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3807 tmp_annotation_vcf_name_err = ( 3808 tmp_annotation_vcf_name + ".err" 3809 ) 3810 err_files.append(tmp_annotation_vcf_name_err) 3811 3812 # Annotate Command 3813 log.debug( 3814 f"Annotation '{annotation}' - add bcftools command" 3815 ) 3816 3817 # Command 3818 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3819 3820 # Add command 3821 commands.append(command_annotate) 3822 3823 # if some commands 3824 if commands: 3825 3826 # Export VCF file 3827 self.export_variant_vcf( 3828 vcf_file=tmp_vcf_name, 3829 remove_info=True, 3830 add_samples=False, 3831 index=True, 3832 ) 3833 3834 # Threads 3835 # calculate threads for annotated commands 3836 if commands: 3837 threads_bcftools_annotate = round(threads / len(commands)) 3838 else: 3839 threads_bcftools_annotate = 1 3840 3841 if not threads_bcftools_annotate: 3842 threads_bcftools_annotate = 1 3843 3844 # Add threads option to bcftools commands 3845 if threads_bcftools_annotate > 1: 3846 commands_threaded = [] 3847 for command in commands: 3848 commands_threaded.append( 3849 command.replace( 3850 f"{bcftools_bin_command} annotate ", 3851 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3852 ) 3853 ) 3854 commands = commands_threaded 3855 3856 # Command annotation multithreading 3857 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3858 log.info( 3859 f"Annotation - Annotation multithreaded in " 3860 + str(len(commands)) 3861 + " commands" 3862 ) 3863 3864 run_parallel_commands(commands, threads) 3865 3866 # Merge 3867 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 3868 3869 if tmp_ann_vcf_list_cmd: 3870 3871 # Tmp file 3872 tmp_annotate_vcf = NamedTemporaryFile( 3873 prefix=self.get_prefix(), 3874 dir=self.get_tmp_dir(), 3875 suffix=".vcf.gz", 3876 delete=True, 3877 ) 3878 tmp_annotate_vcf_name = tmp_annotate_vcf.name 3879 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 3880 err_files.append(tmp_annotate_vcf_name_err) 3881 3882 # Tmp file remove command 3883 tmp_files_remove_command = "" 3884 if tmp_files: 3885 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 3886 3887 # Command merge 3888 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 3889 log.info( 3890 f"Annotation - Annotation merging " 3891 + str(len(commands)) 3892 + " annotated files" 3893 ) 3894 log.debug(f"Annotation - merge command: {merge_command}") 3895 run_parallel_commands([merge_command], 1) 3896 3897 # Error messages 3898 log.info(f"Error/Warning messages:") 3899 error_message_command_all = [] 3900 error_message_command_warning = [] 3901 error_message_command_err = [] 3902 for err_file in err_files: 3903 with open(err_file, "r") as f: 3904 for line in f: 3905 message = line.strip() 3906 error_message_command_all.append(message) 3907 if line.startswith("[W::"): 3908 error_message_command_warning.append(message) 3909 if line.startswith("[E::"): 3910 error_message_command_err.append( 3911 f"{err_file}: " + message 3912 ) 3913 # log info 3914 for message in list( 3915 set(error_message_command_err + error_message_command_warning) 3916 ): 3917 log.info(f" {message}") 3918 # debug info 3919 for message in list(set(error_message_command_all)): 3920 log.debug(f" {message}") 3921 # failed 3922 if len(error_message_command_err): 3923 log.error("Annotation failed: Error in commands") 3924 raise ValueError("Annotation failed: Error in commands") 3925 3926 # Update variants 3927 log.info(f"Annotation - Updating...") 3928 self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates with bcftools.

Parameters
- threads: Number of threads to use.

Returns
The return value of the annotation run.
    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        This function annotates with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
            Default: None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict):
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list):
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If it does not exist, the database release will be downloaded (takes a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, the first sample in the VCF will be chosen
        - If no HPO found, the "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :return: True on success; False if the VCF has no variants or no samples
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads: fall back to the instance-level default when not provided
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        # NOTE(review): a missing database folder is only logged here, not raised;
        # databases_download_exomiser() below is expected to create/populate it — confirm
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser: resolve the java -jar command line for exomiser-cli
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser (section "annotation" -> "exomiser")
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly (param overrides config, then global default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header (mutated in place below to register new INFO fields)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples: Exomiser needs at least one sample/genotype
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        # NOTE(review): built but not visibly used later in this method — confirm
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (if not exists)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation
        # NOTE(review): hard-coded True, so the guard below always passes
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json/yaml
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json;
                        # yaml.safe_load parses both since JSON is a YAML subset)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Load analysis dict into analysis dict (either yaml or json)
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict
                # Use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file name from the preset keyword
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            # param_exomiser_analysis_dict[""] = json.load(json_file)
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load phenopacket json/yaml
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Load phenopacket dict into analysis dict (either yaml or json)
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject not exists -> find a sample ID
                    if not param_exomiser_subject:

                        # Found sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample in VCF header by default)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject with unknown sex
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures not exists -> Try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # Found HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list: keep only digits of each id and build "HP:<digits>"
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
                    # (phenotype-driven prioritisation is meaningless without HPO terms)
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict, synthesize a minimal one
                # (ISO-8601-like UTC-suffixed timestamp)
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Replace outputDirectory in output options
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output_results and force required output formats
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files

                # Split analysis dict
                # NOTE(review): .copy() is shallow — nested dicts stay shared with
                # param_exomiser_analysis_dict; only the top-level "phenopacket" key
                # is popped below, so this is sufficient here
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                #######################

                ### Create list of samples to use and include into initial VCF file ####

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample ID within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample ID in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and sample IDs in pedigree (deduplicated)
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with sample (either sample in param or first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): unused variable — the command is built below into
                # exomiser_command_analysis
                exomiser_command = ""

                # Command exomiser options: point Exomiser at the per-assembly database folder
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list (only if present in the database folder)
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains proband param -> split analysis/sample files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually unique sample) -> single combined analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command
                # NOTE(review): .split() on the command string breaks if any path
                # contains spaces — confirm paths are space-free upstream
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Whether to explode TSV columns into INFO fields
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types (LIMIT 0 -> schema only, no rows)
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Fields to avoid (coordinate/identity columns, not annotations)
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type: object dtype that fully coerces to
                            # numeric -> Float; non-object dtype -> Integer;
                            # otherwise stays String
                            # NOTE(review): nesting reconstructed from mangled source —
                            # confirm against original indentation
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info: sanitize field name ("-" -> "_", drop "#")
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to add for update to concat fields
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )

                                    ELSE ''
                                END
                                """
                            )

                    # Update query: append the concatenated "name=value;" pairs to INFO,
                    # joining TSV rows to variants on CONTIG/START/REF/ALT
                    # (TSV CONTIG is numeric, hence the 'chr' prefix on the join key)
                    sql_query_update = f"""
                    UPDATE {table_variants} as table_variants
                    SET INFO = concat(
                                    CASE
                                        WHEN INFO NOT IN ('', '.')
                                        THEN INFO
                                        ELSE ''
                                    END,
                                    CASE
                                        WHEN table_variants.INFO NOT IN ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    (
                                    SELECT
                                        concat(
                                            {",".join(sql_query_update_concat_fields)}
                                        )
                                    FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                        AND table_parquet.\"START\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                    )
                                )
                    ;
                    """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

                return True
This function annotates with Exomiser.
This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO). Default: None
- "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
- "sample" (string): Sample name used to construct the "subject" section: "subject": { "id": "<sample>", "sex": "UNKNOWN_SEX" } Default: None
- "phenotypicFeatures" (dict): Phenotypic features to construct the "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
- "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
- "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (this takes a while). Default: None (provided by the application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample is given in parameters, the first sample in the VCF will be chosen
- If no HPO terms are found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
    def annotation_snpeff(self, threads: int = None) -> None:
        """
        Annotate loaded variants with snpEff.

        Exports the variants table to a temporary VCF, runs the snpEff jar on it
        (downloading the snpEff database for the configured assembly if needed),
        merges any new INFO header fields into the current VCF header, and then
        updates the variants table from the annotated VCF.

        :param threads: The number of threads to use; defaults to
            ``self.get_threads()`` when not provided
        :type threads: int
        :return: None. Returns early (without annotating) if the variants table
            is empty.
        :raises ValueError: if no snpEff bin command can be resolved, or if the
            snpEff command wrote error lines (``[E::``) to its stderr file.
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads: fall back to the instance-level thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG - keep tmp files/folders when verbosity is "debug"
        # NOTE(review): delete_tmp is computed but not referenced later in this
        # method — tmp file cleanup here relies on NamedTemporaryFile deletes;
        # confirm whether this flag is intentionally unused.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # # Config - Java
        # java_bin = get_bin(
        #     tool="java",
        #     bin="java",
        #     bin_type="bin",
        #     config=config,
        #     default_folder="/usr/bin",
        # )
        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")

        # # Config - snpEff bin
        # snpeff_jar = get_bin(
        #     tool="snpeff",
        #     bin="snpEff.jar",
        #     bin_type="jar",
        #     config=config,
        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        # )
        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")

        # Config - snpEff bin command (full "java ... -jar snpEff.jar"-style
        # command resolved from config/default tool folders)
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases folder (created if missing)
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (logged only; the value actually used for the command
        # is re-read into snpeff_options below with a different default)
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options: base options plus optional -stats / -csvStats outputs.
        # "OUTPUT" placeholders in stats paths are substituted with the output file.
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # NOTE(review): delete=True means the file is removed when tmp_vcf is
        # garbage-collected/closed; the name is reused below, which works while
        # this reference stays alive — confirm on non-Linux platforms.
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        # if config.get("memory", None):
        #     memory_limit = config.get("memory", "8G")
        # else:
        #     memory_limit = "8G"
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): snpeff_java_options is built here but never injected into
        # snpeff_command below — presumably get_bin_command supplies java options;
        # verify, otherwise the -Xmx/GC settings are silently ignored. The log
        # message below also says "Exomiser" although these are snpEff options.
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # Annotation is always (re-)applied, even if "ANN" already exists
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Check snpEff database (downloads the assembly database if absent)
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file (INFO stripped, no samples, indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp file for the annotated output and its stderr capture
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command: snpEff writes the annotated VCF to stdout, errors to .err
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Error messages: scan stderr files, report warnings ([W::]) at info
            # level and fail on errors ([E::])
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f" {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f" {message}")
            # failed
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find annotation in header: merge new INFO fields from the
            # annotated VCF's header into the current header
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants table from the annotated VCF
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotate with snpEff
Parameters
- threads: The number of threads to use
Returns
None.
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate loaded variants with Annovar.

        Exports the variants table to a temporary VCF, runs ``table_annovar.pl``
        once per configured annotation database (downloading databases if
        needed), post-processes each output through a bcftools/sed/awk pipeline
        (field renaming, cleanup of Annovar artifacts, compression, indexing),
        merges all annotated VCFs with ``bcftools merge``, updates the variants
        table from the merged VCF, and finally removes temporary files.

        :param threads: Number of threads to use; defaults to
            ``self.get_threads()`` when not provided
        :type threads: int
        :return: None. Returns early (without annotating) if the variants table
            is empty; does nothing when no annotations are configured.
        :raises ValueError: if the annovar or bcftools bin command cannot be
            resolved, or if a command wrote error lines to its stderr file.
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads: fall back to the instance-level thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files (accumulated for final cleanup / error reporting)
        tmp_files = []
        err_files = []

        # DEBUG - keep tmp files/folders when verbosity is "debug"
        # NOTE(review): delete_tmp is computed but cleanup below is
        # unconditional (`if True:`) — confirm whether this flag should gate it.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl options; "genebase" is special)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: dict of {database_name: {field: new_name, ...}}
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Annotation is always (re-)applied, even for fields already in header
        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO set to ".", no samples, indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run + post-processing pipeline per database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar (directory auto-removed on GC)
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" mapping line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: gene-based for refGene/ensGene, region-based for
                # cytoBand, filter-based otherwise
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr files, report warnings at info
                # level and fail on errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f" {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f" {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged result and its stderr capture
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: combine the base VCF with all per-database
                # annotated VCFs into a single file
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header: merge new INFO fields from the
                # merged VCF's header into the current header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations
Parameters
- threads: number of threads to use
Returns
None.
5330 def annotation_parquet(self, threads: int = None) -> None: 5331 """ 5332 It takes a VCF file, and annotates it with a parquet file 5333 5334 :param threads: number of threads to use for the annotation 5335 :return: the value of the variable "result". 5336 """ 5337 5338 # DEBUG 5339 log.debug("Start annotation with parquet databases") 5340 5341 # Threads 5342 if not threads: 5343 threads = self.get_threads() 5344 log.debug("Threads: " + str(threads)) 5345 5346 # DEBUG 5347 delete_tmp = True 5348 if self.get_config().get("verbosity", "warning") in ["debug"]: 5349 delete_tmp = False 5350 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5351 5352 # Config 5353 databases_folders = set( 5354 self.get_config() 5355 .get("folders", {}) 5356 .get("databases", {}) 5357 .get("annotations", ["."]) 5358 + self.get_config() 5359 .get("folders", {}) 5360 .get("databases", {}) 5361 .get("parquet", ["."]) 5362 ) 5363 log.debug("Databases annotations: " + str(databases_folders)) 5364 5365 # Param 5366 annotations = ( 5367 self.get_param() 5368 .get("annotation", {}) 5369 .get("parquet", {}) 5370 .get("annotations", None) 5371 ) 5372 log.debug("Annotations: " + str(annotations)) 5373 5374 # Assembly 5375 assembly = self.get_param().get( 5376 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5377 ) 5378 5379 # Force Update Annotation 5380 force_update_annotation = ( 5381 self.get_param() 5382 .get("annotation", {}) 5383 .get("options", {}) 5384 .get("annotations_update", False) 5385 ) 5386 log.debug(f"force_update_annotation={force_update_annotation}") 5387 force_append_annotation = ( 5388 self.get_param() 5389 .get("annotation", {}) 5390 .get("options", {}) 5391 .get("annotations_append", False) 5392 ) 5393 log.debug(f"force_append_annotation={force_append_annotation}") 5394 5395 # Data 5396 table_variants = self.get_table_variants() 5397 5398 # Check if not empty 5399 log.debug("Check if not empty") 5400 sql_query_chromosomes_df = self.get_query_to_df( 
5401 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5402 ) 5403 if not sql_query_chromosomes_df["count"][0]: 5404 log.info(f"VCF empty") 5405 return 5406 5407 # VCF header 5408 vcf_reader = self.get_header() 5409 log.debug("Initial header: " + str(vcf_reader.infos)) 5410 5411 # Nb Variants POS 5412 log.debug("NB Variants Start") 5413 nb_variants = self.conn.execute( 5414 f"SELECT count(*) AS count FROM variants" 5415 ).fetchdf()["count"][0] 5416 log.debug("NB Variants Stop") 5417 5418 # Existing annotations 5419 for vcf_annotation in self.get_header().infos: 5420 5421 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5422 log.debug( 5423 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5424 ) 5425 5426 # Added columns 5427 added_columns = [] 5428 5429 # drop indexes 5430 log.debug(f"Drop indexes...") 5431 self.drop_indexes() 5432 5433 if annotations: 5434 5435 if "ALL" in annotations: 5436 5437 all_param = annotations.get("ALL", {}) 5438 all_param_formats = all_param.get("formats", None) 5439 all_param_releases = all_param.get("releases", None) 5440 5441 databases_infos_dict = self.scan_databases( 5442 database_formats=all_param_formats, 5443 database_releases=all_param_releases, 5444 ) 5445 for database_infos in databases_infos_dict.keys(): 5446 if database_infos not in annotations: 5447 annotations[database_infos] = {"INFO": None} 5448 5449 for annotation in annotations: 5450 5451 if annotation in ["ALL"]: 5452 continue 5453 5454 # Annotation Name 5455 annotation_name = os.path.basename(annotation) 5456 5457 # Annotation fields 5458 annotation_fields = annotations[annotation] 5459 if not annotation_fields: 5460 annotation_fields = {"INFO": None} 5461 5462 log.debug(f"Annotation '{annotation_name}'") 5463 log.debug( 5464 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5465 ) 5466 5467 # Create Database 5468 database = Database( 5469 database=annotation, 5470 
databases_folders=databases_folders, 5471 assembly=assembly, 5472 ) 5473 5474 # Find files 5475 parquet_file = database.get_database() 5476 parquet_hdr_file = database.get_header_file() 5477 parquet_type = database.get_type() 5478 5479 # Check if files exists 5480 if not parquet_file or not parquet_hdr_file: 5481 log.error("Annotation failed: file not found") 5482 raise ValueError("Annotation failed: file not found") 5483 else: 5484 # Get parquet connexion 5485 parquet_sql_attach = database.get_sql_database_attach( 5486 output="query" 5487 ) 5488 if parquet_sql_attach: 5489 self.conn.execute(parquet_sql_attach) 5490 parquet_file_link = database.get_sql_database_link() 5491 # Log 5492 log.debug( 5493 f"Annotation '{annotation_name}' - file: " 5494 + str(parquet_file) 5495 + " and " 5496 + str(parquet_hdr_file) 5497 ) 5498 5499 # Database full header columns 5500 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 5501 parquet_hdr_file 5502 ) 5503 # Log 5504 log.debug( 5505 "Annotation database header columns : " 5506 + str(parquet_hdr_vcf_header_columns) 5507 ) 5508 5509 # Load header as VCF object 5510 parquet_hdr_vcf_header_infos = database.get_header().infos 5511 # Log 5512 log.debug( 5513 "Annotation database header: " 5514 + str(parquet_hdr_vcf_header_infos) 5515 ) 5516 5517 # Get extra infos 5518 parquet_columns = database.get_extra_columns() 5519 # Log 5520 log.debug("Annotation database Columns: " + str(parquet_columns)) 5521 5522 # Add extra columns if "ALL" in annotation_fields 5523 # if "ALL" in annotation_fields: 5524 # allow_add_extra_column = True 5525 if "ALL" in annotation_fields and database.get_extra_columns(): 5526 for extra_column in database.get_extra_columns(): 5527 if ( 5528 extra_column not in annotation_fields 5529 and extra_column.replace("INFO/", "") 5530 not in parquet_hdr_vcf_header_infos 5531 ): 5532 parquet_hdr_vcf_header_infos[extra_column] = ( 5533 vcf.parser._Info( 5534 extra_column, 5535 ".", 5536 "String", 5537 
f"{extra_column} description", 5538 "unknown", 5539 "unknown", 5540 self.code_type_map["String"], 5541 ) 5542 ) 5543 5544 # For all fields in database 5545 annotation_fields_all = False 5546 if "ALL" in annotation_fields or "INFO" in annotation_fields: 5547 annotation_fields_all = True 5548 annotation_fields = { 5549 key: key for key in parquet_hdr_vcf_header_infos 5550 } 5551 5552 log.debug( 5553 "Annotation database header - All annotations added: " 5554 + str(annotation_fields) 5555 ) 5556 5557 # Init 5558 5559 # List of annotation fields to use 5560 sql_query_annotation_update_info_sets = [] 5561 5562 # List of annotation to agregate 5563 sql_query_annotation_to_agregate = [] 5564 5565 # Number of fields 5566 nb_annotation_field = 0 5567 5568 # Annotation fields processed 5569 annotation_fields_processed = [] 5570 5571 # Columns mapping 5572 map_columns = database.map_columns( 5573 columns=annotation_fields, prefixes=["INFO/"] 5574 ) 5575 5576 # Query dict for fields to remove (update option) 5577 query_dict_remove = {} 5578 5579 # Fetch Anotation fields 5580 for annotation_field in annotation_fields: 5581 5582 # annotation_field_column 5583 annotation_field_column = map_columns.get( 5584 annotation_field, "INFO" 5585 ) 5586 5587 # field new name, if parametered 5588 annotation_fields_new_name = annotation_fields.get( 5589 annotation_field, annotation_field 5590 ) 5591 if not annotation_fields_new_name: 5592 annotation_fields_new_name = annotation_field 5593 5594 # To annotate 5595 # force_update_annotation = True 5596 # force_append_annotation = True 5597 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 5598 if annotation_field in parquet_hdr_vcf_header_infos and ( 5599 force_update_annotation 5600 or force_append_annotation 5601 or ( 5602 annotation_fields_new_name 5603 not in self.get_header().infos 5604 ) 5605 ): 5606 5607 # Add field to annotation to process 
list 5608 annotation_fields_processed.append( 5609 annotation_fields_new_name 5610 ) 5611 5612 # explode infos for the field 5613 annotation_fields_new_name_info_msg = "" 5614 if ( 5615 force_update_annotation 5616 and annotation_fields_new_name 5617 in self.get_header().infos 5618 ): 5619 # Remove field from INFO 5620 query = f""" 5621 UPDATE {table_variants} as table_variants 5622 SET INFO = REGEXP_REPLACE( 5623 concat(table_variants.INFO,''), 5624 ';*{annotation_fields_new_name}=[^;]*', 5625 '' 5626 ) 5627 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 5628 """ 5629 annotation_fields_new_name_info_msg = " [update]" 5630 query_dict_remove[ 5631 f"remove 'INFO/{annotation_fields_new_name}'" 5632 ] = query 5633 5634 # Sep between fields in INFO 5635 nb_annotation_field += 1 5636 if nb_annotation_field > 1: 5637 annotation_field_sep = ";" 5638 else: 5639 annotation_field_sep = "" 5640 5641 log.info( 5642 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 5643 ) 5644 5645 # Add INFO field to header 5646 parquet_hdr_vcf_header_infos_number = ( 5647 parquet_hdr_vcf_header_infos[annotation_field].num 5648 or "." 
5649 ) 5650 parquet_hdr_vcf_header_infos_type = ( 5651 parquet_hdr_vcf_header_infos[annotation_field].type 5652 or "String" 5653 ) 5654 parquet_hdr_vcf_header_infos_description = ( 5655 parquet_hdr_vcf_header_infos[annotation_field].desc 5656 or f"{annotation_field} description" 5657 ) 5658 parquet_hdr_vcf_header_infos_source = ( 5659 parquet_hdr_vcf_header_infos[annotation_field].source 5660 or "unknown" 5661 ) 5662 parquet_hdr_vcf_header_infos_version = ( 5663 parquet_hdr_vcf_header_infos[annotation_field].version 5664 or "unknown" 5665 ) 5666 5667 vcf_reader.infos[annotation_fields_new_name] = ( 5668 vcf.parser._Info( 5669 annotation_fields_new_name, 5670 parquet_hdr_vcf_header_infos_number, 5671 parquet_hdr_vcf_header_infos_type, 5672 parquet_hdr_vcf_header_infos_description, 5673 parquet_hdr_vcf_header_infos_source, 5674 parquet_hdr_vcf_header_infos_version, 5675 self.code_type_map[ 5676 parquet_hdr_vcf_header_infos_type 5677 ], 5678 ) 5679 ) 5680 5681 # Append 5682 if force_append_annotation: 5683 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 5684 else: 5685 query_case_when_append = "" 5686 5687 # Annotation/Update query fields 5688 # Found in INFO column 5689 if ( 5690 annotation_field_column == "INFO" 5691 and "INFO" in parquet_hdr_vcf_header_columns 5692 ): 5693 sql_query_annotation_update_info_sets.append( 5694 f""" 5695 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 5696 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 5697 ELSE '' 5698 END 5699 """ 5700 ) 5701 # Found in a specific column 5702 else: 5703 sql_query_annotation_update_info_sets.append( 5704 f""" 5705 CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append} 5706 THEN 
concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ',')) 5707 ELSE '' 5708 END 5709 """ 5710 ) 5711 sql_query_annotation_to_agregate.append( 5712 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 5713 ) 5714 5715 # Not to annotate 5716 else: 5717 5718 if force_update_annotation: 5719 annotation_message = "forced" 5720 else: 5721 annotation_message = "skipped" 5722 5723 if annotation_field not in parquet_hdr_vcf_header_infos: 5724 log.warning( 5725 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 5726 ) 5727 if annotation_fields_new_name in self.get_header().infos: 5728 log.warning( 5729 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 5730 ) 5731 5732 # Check if ALL fields have to be annotated. Thus concat all INFO field 5733 # allow_annotation_full_info = True 5734 allow_annotation_full_info = not force_append_annotation 5735 5736 if parquet_type in ["regions"]: 5737 allow_annotation_full_info = False 5738 5739 if ( 5740 allow_annotation_full_info 5741 and nb_annotation_field == len(annotation_fields) 5742 and annotation_fields_all 5743 and ( 5744 "INFO" in parquet_hdr_vcf_header_columns 5745 and "INFO" in database.get_extra_columns() 5746 ) 5747 ): 5748 log.debug("Column INFO annotation enabled") 5749 sql_query_annotation_update_info_sets = [] 5750 sql_query_annotation_update_info_sets.append( 5751 f" table_parquet.INFO " 5752 ) 5753 5754 if sql_query_annotation_update_info_sets: 5755 5756 # Annotate 5757 log.info(f"Annotation '{annotation_name}' - Annotation...") 5758 5759 # Join query annotation update info sets for SQL 5760 sql_query_annotation_update_info_sets_sql = ",".join( 5761 sql_query_annotation_update_info_sets 5762 ) 5763 5764 # Check chromosomes list (and variants 
infos) 5765 sql_query_chromosomes = f""" 5766 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 5767 FROM {table_variants} as table_variants 5768 GROUP BY table_variants."#CHROM" 5769 ORDER BY table_variants."#CHROM" 5770 """ 5771 sql_query_chromosomes_df = self.conn.execute( 5772 sql_query_chromosomes 5773 ).df() 5774 sql_query_chromosomes_dict = { 5775 entry["CHROM"]: { 5776 "count": entry["count_variants"], 5777 "min": entry["min_variants"], 5778 "max": entry["max_variants"], 5779 } 5780 for index, entry in sql_query_chromosomes_df.iterrows() 5781 } 5782 5783 # Init 5784 nb_of_query = 0 5785 nb_of_variant_annotated = 0 5786 query_dict = query_dict_remove 5787 5788 # for chrom in sql_query_chromosomes_df["CHROM"]: 5789 for chrom in sql_query_chromosomes_dict: 5790 5791 # Number of variant by chromosome 5792 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 5793 chrom, {} 5794 ).get("count", 0) 5795 5796 log.debug( 5797 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 
5798 ) 5799 5800 # Annotation with regions database 5801 if parquet_type in ["regions"]: 5802 sql_query_annotation_from_clause = f""" 5803 FROM ( 5804 SELECT 5805 '{chrom}' AS \"#CHROM\", 5806 table_variants_from.\"POS\" AS \"POS\", 5807 {",".join(sql_query_annotation_to_agregate)} 5808 FROM {table_variants} as table_variants_from 5809 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 5810 table_parquet_from."#CHROM" = '{chrom}' 5811 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 5812 AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1) 5813 OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 5814 ) 5815 ) 5816 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 5817 GROUP BY table_variants_from.\"POS\" 5818 ) 5819 as table_parquet 5820 """ 5821 5822 sql_query_annotation_where_clause = """ 5823 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5824 AND table_parquet.\"POS\" = table_variants.\"POS\" 5825 """ 5826 5827 # Annotation with variants database 5828 else: 5829 sql_query_annotation_from_clause = f""" 5830 FROM {parquet_file_link} as table_parquet 5831 """ 5832 sql_query_annotation_where_clause = f""" 5833 table_variants."#CHROM" = '{chrom}' 5834 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5835 AND table_parquet.\"POS\" = table_variants.\"POS\" 5836 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5837 AND table_parquet.\"REF\" = table_variants.\"REF\" 5838 """ 5839 5840 # Create update query 5841 sql_query_annotation_chrom_interval_pos = f""" 5842 UPDATE {table_variants} as table_variants 5843 SET INFO = 5844 concat( 5845 CASE WHEN table_variants.INFO NOT IN ('','.') 5846 THEN table_variants.INFO 5847 ELSE '' 5848 END 5849 , 5850 CASE WHEN table_variants.INFO NOT IN ('','.') 5851 AND ( 5852 concat({sql_query_annotation_update_info_sets_sql}) 5853 ) 5854 NOT IN ('','.') 5855 THEN ';' 5856 ELSE '' 5857 END 5858 , 5859 
{sql_query_annotation_update_info_sets_sql} 5860 ) 5861 {sql_query_annotation_from_clause} 5862 WHERE {sql_query_annotation_where_clause} 5863 ; 5864 """ 5865 5866 # Add update query to dict 5867 query_dict[ 5868 f"{chrom} [{nb_of_variant_by_chrom} variants]" 5869 ] = sql_query_annotation_chrom_interval_pos 5870 5871 nb_of_query = len(query_dict) 5872 num_query = 0 5873 5874 # SET max_expression_depth TO x 5875 self.conn.execute("SET max_expression_depth TO 10000") 5876 5877 for query_name in query_dict: 5878 query = query_dict[query_name] 5879 num_query += 1 5880 log.info( 5881 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 5882 ) 5883 result = self.conn.execute(query) 5884 nb_of_variant_annotated_by_query = result.df()["Count"][0] 5885 nb_of_variant_annotated += nb_of_variant_annotated_by_query 5886 log.info( 5887 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 5888 ) 5889 5890 log.info( 5891 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 5892 ) 5893 5894 else: 5895 5896 log.info( 5897 f"Annotation '{annotation_name}' - No Annotations available" 5898 ) 5899 5900 log.debug("Final header: " + str(vcf_reader.infos)) 5901 5902 # Remove added columns 5903 for added_column in added_columns: 5904 self.drop_column(column=added_column)
Annotates the variants of a VCF file using a Parquet annotation database.
Parameters
- threads: number of threads to use for the annotation
Returns
None; the variants table is updated in place.
5906 def annotation_splice(self, threads: int = None) -> None: 5907 """ 5908 This function annotate with snpEff 5909 5910 :param threads: The number of threads to use 5911 :return: the value of the variable "return_value". 5912 """ 5913 5914 # DEBUG 5915 log.debug("Start annotation with splice tools") 5916 5917 # Threads 5918 if not threads: 5919 threads = self.get_threads() 5920 log.debug("Threads: " + str(threads)) 5921 5922 # DEBUG 5923 delete_tmp = True 5924 if self.get_config().get("verbosity", "warning") in ["debug"]: 5925 delete_tmp = False 5926 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5927 5928 # Config 5929 config = self.get_config() 5930 log.debug("Config: " + str(config)) 5931 splice_config = config.get("tools", {}).get("splice", {}) 5932 if not splice_config: 5933 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5934 if not splice_config: 5935 msg_err = "No Splice tool config" 5936 log.error(msg_err) 5937 raise ValueError(msg_err) 5938 log.debug(f"splice_config={splice_config}") 5939 5940 # Config - Folders - Databases 5941 databases_folders = ( 5942 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5943 ) 5944 log.debug("Databases annotations: " + str(databases_folders)) 5945 5946 # Splice docker image 5947 splice_docker_image = splice_config.get("docker").get("image") 5948 5949 # Pull splice image if it's not already there 5950 if not check_docker_image_exists(splice_docker_image): 5951 log.warning( 5952 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5953 ) 5954 try: 5955 command(f"docker pull {splice_config.get('docker').get('image')}") 5956 except subprocess.CalledProcessError: 5957 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5958 log.error(msg_err) 5959 raise ValueError(msg_err) 5960 return None 5961 5962 # Config - splice databases 5963 splice_databases = ( 5964 config.get("folders", {}) 5965 .get("databases", {}) 5966 
.get("splice", DEFAULT_SPLICE_FOLDER) 5967 ) 5968 splice_databases = full_path(splice_databases) 5969 5970 # Param 5971 param = self.get_param() 5972 log.debug("Param: " + str(param)) 5973 5974 # Param 5975 options = param.get("annotation", {}).get("splice", {}) 5976 log.debug("Options: " + str(options)) 5977 5978 # Data 5979 table_variants = self.get_table_variants() 5980 5981 # Check if not empty 5982 log.debug("Check if not empty") 5983 sql_query_chromosomes = ( 5984 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5985 ) 5986 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5987 log.info("VCF empty") 5988 return None 5989 5990 # Export in VCF 5991 log.debug("Create initial file to annotate") 5992 5993 # Create output folder 5994 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 5995 if not os.path.exists(output_folder): 5996 Path(output_folder).mkdir(parents=True, exist_ok=True) 5997 5998 # Create tmp VCF file 5999 tmp_vcf = NamedTemporaryFile( 6000 prefix=self.get_prefix(), 6001 dir=output_folder, 6002 suffix=".vcf", 6003 delete=False, 6004 ) 6005 tmp_vcf_name = tmp_vcf.name 6006 6007 # VCF header 6008 header = self.get_header() 6009 6010 # Existing annotations 6011 for vcf_annotation in self.get_header().infos: 6012 6013 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6014 log.debug( 6015 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6016 ) 6017 6018 # Memory limit 6019 if config.get("memory", None): 6020 memory_limit = config.get("memory", "8G").upper() 6021 # upper() 6022 else: 6023 memory_limit = "8G" 6024 log.debug(f"memory_limit: {memory_limit}") 6025 6026 # Check number of variants to annotate 6027 where_clause_regex_spliceai = r"SpliceAI_\w+" 6028 where_clause_regex_spip = r"SPiP_\w+" 6029 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6030 
df_list_of_variants_to_annotate = self.get_query_to_df( 6031 query=f""" SELECT * FROM variants {where_clause} """ 6032 ) 6033 if len(df_list_of_variants_to_annotate) == 0: 6034 log.warning( 6035 f"No variants to annotate with splice. Variants probably already annotated with splice" 6036 ) 6037 return None 6038 else: 6039 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6040 6041 # Export VCF file 6042 self.export_variant_vcf( 6043 vcf_file=tmp_vcf_name, 6044 remove_info=True, 6045 add_samples=True, 6046 index=False, 6047 where_clause=where_clause, 6048 ) 6049 6050 # Create docker container and launch splice analysis 6051 if splice_config: 6052 6053 # Splice mount folders 6054 mount_folders = splice_config.get("mount", {}) 6055 6056 # Genome mount 6057 mount_folders[ 6058 config.get("folders", {}) 6059 .get("databases", {}) 6060 .get("genomes", DEFAULT_GENOME_FOLDER) 6061 ] = "ro" 6062 6063 # SpliceAI mount 6064 mount_folders[ 6065 config.get("folders", {}) 6066 .get("databases", {}) 6067 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6068 ] = "ro" 6069 6070 # Genome mount 6071 mount_folders[ 6072 config.get("folders", {}) 6073 .get("databases", {}) 6074 .get("spip", DEFAULT_SPIP_FOLDER) 6075 ] = "ro" 6076 6077 # Mount folders 6078 mount = [] 6079 6080 # Config mount 6081 mount = [ 6082 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6083 for path, mode in mount_folders.items() 6084 ] 6085 6086 if any(value for value in splice_config.values() if value is None): 6087 log.warning("At least one splice config parameter is empty") 6088 return None 6089 6090 # Params in splice nf 6091 def check_values(dico: dict): 6092 """ 6093 Ensure parameters for NF splice pipeline 6094 """ 6095 for key, val in dico.items(): 6096 if key == "genome": 6097 if any( 6098 assemb in options.get("genome", {}) 6099 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6100 ): 6101 yield f"--{key} hg19" 6102 elif any( 6103 assemb in options.get("genome", {}) 6104 for assemb 
in ["hg38", "GRCh38", "grch38", "GRCH38"] 6105 ): 6106 yield f"--{key} hg38" 6107 elif ( 6108 (isinstance(val, str) and val) 6109 or isinstance(val, int) 6110 or isinstance(val, bool) 6111 ): 6112 yield f"--{key} {val}" 6113 6114 # Genome 6115 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6116 options["genome"] = genome 6117 6118 # NF params 6119 nf_params = [] 6120 6121 # Add options 6122 if options: 6123 nf_params = list(check_values(options)) 6124 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6125 else: 6126 log.debug("No NF params provided") 6127 6128 # Add threads 6129 if "threads" not in options.keys(): 6130 nf_params.append(f"--threads {threads}") 6131 6132 # Genome path 6133 genome_path = find_genome( 6134 config.get("folders", {}) 6135 .get("databases", {}) 6136 .get("genomes", DEFAULT_GENOME_FOLDER), 6137 file=f"{genome}.fa", 6138 ) 6139 # Add genome path 6140 if not genome_path: 6141 raise ValueError( 6142 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6143 ) 6144 else: 6145 log.debug(f"Genome: {genome_path}") 6146 nf_params.append(f"--genome_path {genome_path}") 6147 6148 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6149 """ 6150 Setting up updated databases for SPiP and SpliceAI 6151 """ 6152 6153 try: 6154 6155 # SpliceAI assembly transcriptome 6156 spliceai_assembly = os.path.join( 6157 config.get("folders", {}) 6158 .get("databases", {}) 6159 .get("spliceai", {}), 6160 options.get("genome"), 6161 "transcriptome", 6162 ) 6163 spip_assembly = options.get("genome") 6164 6165 spip = find( 6166 f"transcriptome_{spip_assembly}.RData", 6167 config.get("folders", {}).get("databases", {}).get("spip", {}), 6168 ) 6169 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6170 log.debug(f"SPiP annotations: {spip}") 6171 log.debug(f"SpliceAI annotations: {spliceai}") 6172 if spip and spliceai: 6173 return [ 6174 
f"--spip_transcriptome {spip}", 6175 f"--spliceai_annotations {spliceai}", 6176 ] 6177 else: 6178 # TODO crash and go on with basic annotations ? 6179 # raise ValueError( 6180 # "Can't find splice databases in configuration EXIT" 6181 # ) 6182 log.warning( 6183 "Can't find splice databases in configuration, use annotations file from image" 6184 ) 6185 except TypeError: 6186 log.warning( 6187 "Can't find splice databases in configuration, use annotations file from image" 6188 ) 6189 return [] 6190 6191 # Add options, check if transcriptome option have already beend provided 6192 if ( 6193 "spip_transcriptome" not in nf_params 6194 and "spliceai_transcriptome" not in nf_params 6195 ): 6196 splice_reference = splice_annotations(options, config) 6197 if splice_reference: 6198 nf_params.extend(splice_reference) 6199 6200 nf_params.append(f"--output_folder {output_folder}") 6201 6202 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6203 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6204 log.debug(cmd) 6205 6206 splice_config["docker"]["command"] = cmd 6207 6208 docker_cmd = get_bin_command( 6209 tool="splice", 6210 bin_type="docker", 6211 config=config, 6212 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6213 add_options=f"--name {random_uuid} {' '.join(mount)}", 6214 ) 6215 6216 # Docker debug 6217 # if splice_config.get("rm_container"): 6218 # rm_container = "--rm" 6219 # else: 6220 # rm_container = "" 6221 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6222 6223 log.debug(docker_cmd) 6224 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6225 log.debug(res.stdout) 6226 if 
res.stderr: 6227 log.error(res.stderr) 6228 res.check_returncode() 6229 else: 6230 log.warning(f"Splice tool configuration not found: {config}") 6231 6232 # Update variants 6233 log.info("Annotation - Updating...") 6234 # Test find output vcf 6235 log.debug( 6236 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6237 ) 6238 output_vcf = [] 6239 # Wrong folder to look in 6240 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6241 if ( 6242 files 6243 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6244 ): 6245 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6246 # log.debug(os.listdir(options.get("output_folder"))) 6247 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6248 if not output_vcf: 6249 log.debug( 6250 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6251 ) 6252 else: 6253 # Get new header from annotated vcf 6254 log.debug(f"Initial header: {len(header.infos)} fields") 6255 # Create new header with splice infos 6256 new_vcf = Variants(input=output_vcf[0]) 6257 new_vcf_header = new_vcf.get_header().infos 6258 for keys, infos in new_vcf_header.items(): 6259 if keys not in header.infos.keys(): 6260 header.infos[keys] = infos 6261 log.debug(f"New header: {len(header.infos)} fields") 6262 log.debug(f"Splice tmp output: {output_vcf[0]}") 6263 self.update_from_vcf(output_vcf[0]) 6264 6265 # Remove folder 6266 remove_if_exists(output_folder)
This function annotates variants with splice prediction tools (SpliceAI and SPiP).
Parameters
- threads: The number of threads to use
Returns
None; the variants table is updated in place.
6272 def get_config_default(self, name: str) -> dict: 6273 """ 6274 The function `get_config_default` returns a dictionary containing default configurations for 6275 various calculations and prioritizations. 6276 6277 :param name: The `get_config_default` function returns a dictionary containing default 6278 configurations for different calculations and prioritizations. The `name` parameter is used to 6279 specify which specific configuration to retrieve from the dictionary 6280 :type name: str 6281 :return: The function `get_config_default` returns a dictionary containing default configuration 6282 settings for different calculations and prioritizations. The specific configuration settings are 6283 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6284 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6285 returned. If there is no match, an empty dictionary is returned. 6286 """ 6287 6288 config_default = { 6289 "calculations": { 6290 "variant_chr_pos_alt_ref": { 6291 "type": "sql", 6292 "name": "variant_chr_pos_alt_ref", 6293 "description": "Create a variant ID with chromosome, position, alt and ref", 6294 "available": False, 6295 "output_column_name": "variant_chr_pos_alt_ref", 6296 "output_column_type": "String", 6297 "output_column_description": "variant ID with chromosome, position, alt and ref", 6298 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6299 "operation_info": True, 6300 }, 6301 "VARTYPE": { 6302 "type": "sql", 6303 "name": "VARTYPE", 6304 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6305 "available": True, 6306 "output_column_name": "VARTYPE", 6307 "output_column_type": "String", 6308 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6309 "operation_query": """ 6310 CASE 6311 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6312 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6313 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6314 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6315 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6316 ELSE 'UNDEFINED' 6317 END 6318 """, 6319 "info_fields": ["SVTYPE"], 6320 "operation_info": True, 6321 }, 6322 "snpeff_hgvs": { 6323 "type": "python", 6324 "name": "snpeff_hgvs", 6325 "description": "HGVS nomenclatures from snpEff annotation", 6326 "available": True, 6327 "function_name": "calculation_extract_snpeff_hgvs", 6328 "function_params": ["snpeff_hgvs", "ANN"], 6329 }, 6330 "snpeff_ann_explode": { 6331 "type": "python", 6332 "name": "snpeff_ann_explode", 6333 "description": "Explode snpEff annotations with uniquify values", 6334 "available": True, 6335 "function_name": "calculation_snpeff_ann_explode", 6336 "function_params": [False, "fields", "snpeff_", "ANN"], 6337 }, 6338 "snpeff_ann_explode_uniquify": { 6339 "type": "python", 6340 "name": "snpeff_ann_explode_uniquify", 6341 "description": "Explode snpEff annotations", 6342 "available": True, 6343 "function_name": "calculation_snpeff_ann_explode", 6344 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6345 }, 6346 "snpeff_ann_explode_json": { 6347 "type": "python", 6348 "name": "snpeff_ann_explode_json", 6349 "description": "Explode snpEff annotations in JSON format", 6350 "available": True, 6351 "function_name": "calculation_snpeff_ann_explode", 6352 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6353 }, 6354 "NOMEN": { 6355 "type": "python", 6356 "name": "NOMEN", 6357 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6358 "available": True, 6359 "function_name": "calculation_extract_nomen", 6360 "function_params": [], 6361 }, 6362 "FINDBYPIPELINE": { 6363 "type": "python", 6364 "name": "FINDBYPIPELINE", 6365 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6366 "available": True, 6367 "function_name": "calculation_find_by_pipeline", 6368 "function_params": ["findbypipeline"], 6369 }, 6370 "FINDBYSAMPLE": { 6371 "type": "python", 6372 "name": "FINDBYSAMPLE", 6373 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6374 "available": True, 6375 "function_name": "calculation_find_by_pipeline", 6376 "function_params": ["findbysample"], 6377 }, 6378 "GENOTYPECONCORDANCE": { 6379 "type": "python", 6380 "name": "GENOTYPECONCORDANCE", 6381 "description": "Concordance of genotype for multi caller VCF", 6382 "available": True, 6383 "function_name": "calculation_genotype_concordance", 6384 "function_params": [], 6385 }, 6386 "BARCODE": { 6387 "type": "python", 6388 "name": "BARCODE", 6389 "description": "BARCODE as VaRank tool", 6390 "available": True, 6391 "function_name": "calculation_barcode", 6392 "function_params": [], 6393 }, 6394 "BARCODEFAMILY": { 6395 "type": "python", 6396 "name": "BARCODEFAMILY", 6397 "description": "BARCODEFAMILY as VaRank tool", 6398 "available": True, 6399 "function_name": "calculation_barcode_family", 6400 "function_params": ["BCF"], 6401 }, 6402 "TRIO": { 6403 "type": "python", 6404 "name": "TRIO", 6405 "description": "Inheritance for a trio family", 6406 "available": True, 6407 "function_name": "calculation_trio", 6408 "function_params": [], 6409 }, 6410 "VAF": { 6411 "type": "python", 6412 "name": "VAF", 6413 "description": "Variant Allele Frequency (VAF) harmonization", 6414 "available": True, 6415 "function_name": "calculation_vaf_normalization", 6416 "function_params": [], 6417 }, 6418 "VAF_stats": { 6419 "type": "python", 6420 "name": 
"VAF_stats", 6421 "description": "Variant Allele Frequency (VAF) statistics", 6422 "available": True, 6423 "function_name": "calculation_genotype_stats", 6424 "function_params": ["VAF"], 6425 }, 6426 "DP_stats": { 6427 "type": "python", 6428 "name": "DP_stats", 6429 "description": "Depth (DP) statistics", 6430 "available": True, 6431 "function_name": "calculation_genotype_stats", 6432 "function_params": ["DP"], 6433 }, 6434 "variant_id": { 6435 "type": "python", 6436 "name": "variant_id", 6437 "description": "Variant ID generated from variant position and type", 6438 "available": True, 6439 "function_name": "calculation_variant_id", 6440 "function_params": [], 6441 }, 6442 "transcripts_json": { 6443 "type": "python", 6444 "name": "transcripts_json", 6445 "description": "Add transcripts info in JSON format (field 'transcripts_json')", 6446 "available": True, 6447 "function_name": "calculation_transcripts_json", 6448 "function_params": ["transcripts_json"], 6449 }, 6450 }, 6451 "prioritizations": { 6452 "default": { 6453 "filter": [ 6454 { 6455 "type": "notequals", 6456 "value": "!PASS|\\.", 6457 "score": 0, 6458 "flag": "FILTERED", 6459 "comment": ["Bad variant quality"], 6460 }, 6461 { 6462 "type": "equals", 6463 "value": "REJECT", 6464 "score": -20, 6465 "flag": "PASS", 6466 "comment": ["Bad variant quality"], 6467 }, 6468 ], 6469 "DP": [ 6470 { 6471 "type": "gte", 6472 "value": "50", 6473 "score": 5, 6474 "flag": "PASS", 6475 "comment": ["DP higher than 50"], 6476 } 6477 ], 6478 "ANN": [ 6479 { 6480 "type": "contains", 6481 "value": "HIGH", 6482 "score": 5, 6483 "flag": "PASS", 6484 "comment": [ 6485 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6486 ], 6487 }, 6488 { 6489 "type": "contains", 6490 "value": "MODERATE", 6491 "score": 3, 6492 "flag": "PASS", 6493 "comment": [ 6494 "A non-disruptive variant that might change protein effectiveness" 
6495 ], 6496 }, 6497 { 6498 "type": "contains", 6499 "value": "LOW", 6500 "score": 0, 6501 "flag": "FILTERED", 6502 "comment": [ 6503 "Assumed to be mostly harmless or unlikely to change protein behavior" 6504 ], 6505 }, 6506 { 6507 "type": "contains", 6508 "value": "MODIFIER", 6509 "score": 0, 6510 "flag": "FILTERED", 6511 "comment": [ 6512 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6513 ], 6514 }, 6515 ], 6516 } 6517 }, 6518 } 6519 6520 return config_default.get(name, None)
The function get_config_default returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: The
`get_config_default` function returns a dictionary containing default configurations for different calculations and prioritizations. The `name` parameter is used to specify which specific configuration to retrieve from the dictionary
Returns
The function
`get_config_default` returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input `name` parameter provided to the function. If the `name` parameter matches a key in the `config_default` dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.
6522 def get_config_json( 6523 self, name: str, config_dict: dict = {}, config_file: str = None 6524 ) -> dict: 6525 """ 6526 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6527 default values, a dictionary, and a file. 6528 6529 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6530 the name of the configuration. It is used to identify and retrieve the configuration settings 6531 for a specific component or module 6532 :type name: str 6533 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6534 dictionary that allows you to provide additional configuration settings or overrides. When you 6535 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6536 the key is the configuration setting you want to override or 6537 :type config_dict: dict 6538 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6539 specify the path to a configuration file that contains additional settings. If provided, the 6540 function will read the contents of this file and update the configuration dictionary with the 6541 values found in the file, overriding any existing values with the 6542 :type config_file: str 6543 :return: The function `get_config_json` returns a dictionary containing the configuration 6544 settings. 
6545 """ 6546 6547 # Create with default prioritizations 6548 config_default = self.get_config_default(name=name) 6549 configuration = config_default 6550 # log.debug(f"configuration={configuration}") 6551 6552 # Replace prioritizations from dict 6553 for config in config_dict: 6554 configuration[config] = config_dict[config] 6555 6556 # Replace prioritizations from file 6557 config_file = full_path(config_file) 6558 if config_file: 6559 if os.path.exists(config_file): 6560 with open(config_file) as config_file_content: 6561 config_file_dict = json.load(config_file_content) 6562 for config in config_file_dict: 6563 configuration[config] = config_file_dict[config] 6564 else: 6565 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6566 log.error(msg_error) 6567 raise ValueError(msg_error) 6568 6569 return configuration
The function get_config_json retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: The
`name` parameter in the `get_config_json` function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
- config_dict: The `config_dict` parameter in the `get_config_json` function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the `get_config_json` function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override
- config_file: The `config_file` parameter in the `get_config_json` function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values.
Returns
The function
`get_config_json` returns a dictionary containing the configuration settings.
    def prioritization(self) -> None:
        """
        It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other
        INFO fields.

        For each configured prioritization profile, the profile's annotation criteria are
        translated into SQL UPDATE statements that fill per-profile
        PZScore/PZFlag/PZComment/PZInfos columns; those columns are then concatenated back
        into the INFO column (plus a PZTags summary field). Profiles, pzfields, field
        separator, default profile and score mode are read from the "prioritization"
        section of the parameters.

        :raises ValueError: if a requested profile is not present in the prioritization
            configuration
        """

        # Config (NOTE(review): loaded but not used below — kept for parity)
        config = self.get_config()

        # Param
        param = self.get_param()

        # Quick Prioritizations
        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")

        # Configuration profiles: external JSON/YAML describing each profile's criteria
        prioritization_config_file = param.get("prioritization", {}).get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization options (comma-separated strings are accepted for convenience)
        profiles = param.get("prioritization", {}).get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = param.get("prioritization", {}).get(
            "pzfields", ["PZFlag", "PZScore"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = param.get("prioritization", {}).get("default_profile", None)
        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
        prioritization_score_mode = param.get("prioritization", {}).get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: extra profiles given directly as a comma-separated param
        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f"   {profile}")

        # If profile "ALL" provided, use all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Every requested profile must exist in the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            # Nothing to do without a profile
            log.debug(f"No profile defined")
            return

        # First requested profile becomes the default when none is set
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Added columns (temporary working columns, dropped at the end)
        added_columns = []

        # Create list of PZfields
        # List of PZFields: base names plus one per (pzfield, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists (already present in the VCF header)
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode INFO fields into columns so criteria can query them directly
            explode_infos_prefix = self.get_explode_infos_prefix()
            added_columns += self.explode_infos(prefix=explode_infos_prefix)
            extra_infos = self.get_extra_infos()

            # PZfields tags description (VCF header metadata for each PZ field)
            PZfields_INFOS = {
                "PZTags": {
                    "ID": "PZTags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                "PZScore": {
                    "ID": "PZScore",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                "PZFlag": {
                    "ID": "PZFlag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                "PZComment": {
                    "ID": "PZComment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                "PZInfos": {
                    "ID": "PZInfos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
            }

            # Create INFO header fields if not exist (unsuffixed fields = default profile)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO header fields if not exist for each profile (suffixed names)
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Working columns in the variants table, typed by PZ field kind:
            # scores accumulate as INTEGER 0, flags start as BOOLEAN true, rest as text
            for pzfield in list_of_pzfields:
                if re.match("PZScore.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match("PZFlag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        # SQL fragments that serialize each PZ column back into INFO
                        sql_set_info = []

                        # PZ fields set

                        # PZScore
                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZScore{pzfields_sep}{profile}=',
                                    PZScore{pzfields_sep}{profile}
                                )
                                """
                            )
                            # Default profile also writes the unsuffixed field
                            if (
                                profile == default_profile
                                and "PZScore" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    concat(
                                        'PZScore=',
                                        PZScore{pzfields_sep}{profile}
                                    )
                                    """
                                )

                        # PZFlag (boolean column rendered as PASS/FILTERED)
                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZFlag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN PZFlag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN PZFlag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                            if (
                                profile == default_profile
                                and "PZFlag" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    concat(
                                        'PZFlag=',
                                        CASE
                                            WHEN PZFlag{pzfields_sep}{profile}==1
                                            THEN 'PASS'
                                            WHEN PZFlag{pzfields_sep}{profile}==0
                                            THEN 'FILTERED'
                                        END
                                    )
                                    """
                                )

                        # PZComment (only emitted when non-empty)
                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                            if (
                                profile == default_profile
                                and "PZComment" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    CASE
                                        WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
                                        THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
                                        ELSE ''
                                    END
                                    """
                                )

                        # PZInfos (only emitted when non-empty)
                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                            if (
                                profile == default_profile
                                and "PZInfos" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    CASE
                                        WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
                                        THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
                                        ELSE ''
                                    END
                                    """
                                )

                        # Merge PZfields into one comma-separated concat() argument list,
                        # joining successive fields with ';' (INFO field separator)
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # Check if annotation field is present in exploded columns
                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
                                log.debug(f"Annotation '{annotation}' not in data")
                                continue
                            else:
                                log.debug(f"Annotation '{annotation}' in data")

                            # For each criterion of this annotation
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:
                                criterion_type = criterion["type"]
                                criterion_value = criterion["value"]
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Escape quotes/separators so values embed safely in SQL/INFO
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                sql_set = []
                                # NOTE(review): this rebinds the sql_set_info list used
                                # above; sql_set_info_option is already built, so the
                                # rebinding appears to have no further effect — confirm.
                                sql_set_info = []

                                # PZ fields set
                                if (
                                    f"PZScore{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # HOWARD mode: scores accumulate; VaRank mode: keep max
                                    if prioritization_score_mode == "HOWARD":
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                    elif prioritization_score_mode == "VaRank":
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    else:
                                        # Unknown mode falls back to HOWARD accumulation
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
                                    # Flag stays PASS only if every matched criterion passes
                                    sql_set.append(
                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )
                                if (
                                    f"PZComment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        PZComment{pzfields_sep}{profile} =
                                            concat(
                                                PZComment{pzfields_sep}{profile},
                                                CASE
                                                    WHEN PZComment{pzfields_sep}{profile}!=''
                                                    THEN ', '
                                                    ELSE ''
                                                END,
                                                '{criterion_comment}'
                                            )
                                        """
                                    )
                                if (
                                    f"PZInfos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        PZInfos{pzfields_sep}{profile} =
                                            concat(
                                                PZInfos{pzfields_sep}{profile},
                                                '{criterion_infos}'
                                            )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison: numeric values use the
                                # comparison operator from comparison_map; non-numeric
                                # values fall back to regex matching (SIMILAR TO)
                                try:
                                    float(criterion_value)
                                    sql_update = f"""
                                        UPDATE {table_variants}
                                        SET {sql_set_option}
                                        WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                        AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
                                    """
                                except:
                                    contains_option = ""
                                    if criterion_type == "contains":
                                        contains_option = ".*"
                                    sql_update = f"""
                                        UPDATE {table_variants}
                                        SET {sql_set_option}
                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                    """
                                sql_queries.append(sql_update)

                        # PZTags: summary "field#value|field#value|..." for this profile
                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:

                            # Create PZTags value (SQL concat fragment)
                            pztags_value = ""
                            pztags_sep_default = "|"
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in ["PZTags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in ["PZFlag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN PZFlag{pzfields_sep}{profile}
                                                    THEN 'PASS'
                                                    ELSE 'FILTERED'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add query update for PZTags
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    'PZTags{pzfields_sep}{profile}={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add query update for PZTags for default profile (unsuffixed)
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        'PZTags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        if sql_queries:

                            # Apply every criterion/PZTags update for this profile
                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                            # Serialize the PZ columns back into the INFO column
                            log.info(f"""Profile '{profile}' - Update... """)
                            sql_query_update = f"""
                                UPDATE {table_variants}
                                SET INFO =
                                    concat(
                                        CASE
                                            WHEN INFO NOT IN ('','.')
                                            THEN concat(INFO, ';')
                                            ELSE ''
                                        END
                                        {sql_set_info_option}
                                    )
                            """
                            self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added (temporary) columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields (refresh exploded columns if enabled)
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return
It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other INFO fields
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        SNV/InDel variants are extracted from the variants table, annotated in parallel
        (one Dask partition per thread) against refSeq transcripts (and optionally
        refSeqLink proteins) using the reference genome, and the resulting HGVS names
        are written to a temporary column and then appended to the INFO column as
        'hgvs=...'.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            NOTE: this closure reads `polars_conn`, `transcripts`, `genome`, the
            `use_*`/`add_protein`/`full_format`/`codon_type` options and the
            `refseq_df`/`refseqlink_df` dataframes from the enclosing scope; they are
            bound later in `annotation_hgvs`, before the partitions are computed.

            :param row: A dictionary-like object that contains the values for the following keys:
                CHROM, POS, REF, ALT
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts overlapping this position
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript model
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon (only resolved when the exon option is enabled)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession (only looked up when a protein form is needed)
                # NOTE(review): queries refseqlink_df, which is only defined when a
                # refSeqLink file was found — confirm options imply its presence.
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add the protein-level name alongside the default one
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create comma-separated list of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connection (SQL over in-scope polars dataframes)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: "opt=val,opt2,..." shortcut expanded into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    # Bare option name means enable it
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit path first, else search the genomes folder by assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (alphabetic REF/ALT alleles)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (temporary, dropped at the end)
        added_columns = []

        # Add hgvs column in variants table (random suffix avoids name collisions)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading refSeq transcripts overlapping the variants into a polars Dataframe
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading transcript->protein accession mapping
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table}
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model via a temporary TSV export
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connection, re-created now that refseq_df/refseqlink_df exist in scope
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe (triggers the computation)
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column from the computed dataframe
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

            # Update INFO column: append 'hgvs=<value>' where a value was computed
            sql_query_update = f"""
                UPDATE {table_variants}
                SET INFO =
                    concat(
                        CASE
                            WHEN INFO NOT IN ('','.')
                            THEN concat(INFO, ';')
                            ELSE ''
                        END,
                        'hgvs=',
                        {hgvs_column_name}
                    )
                WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
            self.execute_query(sql_query_update)

        # Add header field for the new 'hgvs' INFO annotation
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added (temporary) columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The annotation_hgvs function performs HGVS annotation on a set of variants using genomic
coordinates and alleles.
Parameters
- threads: The `threads` parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the `get_threads()` method.
7504 def get_operations_help( 7505 self, operations_config_dict: dict = {}, operations_config_file: str = None 7506 ) -> list: 7507 7508 # Init 7509 operations_help = [] 7510 7511 # operations 7512 operations = self.get_config_json( 7513 name="calculations", 7514 config_dict=operations_config_dict, 7515 config_file=operations_config_file, 7516 ) 7517 for op in operations: 7518 op_name = operations[op].get("name", op).upper() 7519 op_description = operations[op].get("description", op_name) 7520 op_available = operations[op].get("available", False) 7521 if op_available: 7522 operations_help.append(f" {op_name}: {op_description}") 7523 7524 # Sort operations 7525 operations_help.sort() 7526 7527 # insert header 7528 operations_help.insert(0, "Available calculation operations:") 7529 7530 # Return 7531 return operations_help
7533 def calculation( 7534 self, 7535 operations: dict = {}, 7536 operations_config_dict: dict = {}, 7537 operations_config_file: str = None, 7538 ) -> None: 7539 """ 7540 It takes a list of operations, and for each operation, it checks if it's a python or sql 7541 operation, and then calls the appropriate function 7542 7543 param json example: 7544 "calculation": { 7545 "NOMEN": { 7546 "options": { 7547 "hgvs_field": "hgvs" 7548 }, 7549 "middle" : null 7550 } 7551 """ 7552 7553 # Param 7554 param = self.get_param() 7555 7556 # operations config 7557 operations_config = self.get_config_json( 7558 name="calculations", 7559 config_dict=operations_config_dict, 7560 config_file=operations_config_file, 7561 ) 7562 7563 # Upper keys 7564 operations_config = {k.upper(): v for k, v in operations_config.items()} 7565 7566 # Calculations 7567 7568 # Operations from param 7569 operations = param.get("calculation", {}).get("calculations", operations) 7570 7571 # Quick calculation - add 7572 if param.get("calculations", None): 7573 calculations_list = [ 7574 value for value in param.get("calculations", "").split(",") 7575 ] 7576 log.info(f"Quick Calculations:") 7577 for calculation_key in calculations_list: 7578 log.info(f" {calculation_key}") 7579 for calculation_operation in calculations_list: 7580 if calculation_operation.upper() not in operations: 7581 operations[calculation_operation.upper()] = {} 7582 add_value_into_dict( 7583 dict_tree=param, 7584 sections=[ 7585 "calculation", 7586 "calculations", 7587 calculation_operation.upper(), 7588 ], 7589 value={}, 7590 ) 7591 7592 # Operations for calculation 7593 if not operations: 7594 operations = param.get("calculation", {}).get("calculations", {}) 7595 7596 if operations: 7597 log.info(f"Calculations...") 7598 7599 # For each operations 7600 for operation_name in operations: 7601 operation_name = operation_name.upper() 7602 if operation_name not in [""]: 7603 if operation_name in operations_config: 7604 
log.info(f"Calculation '{operation_name}'") 7605 operation = operations_config[operation_name] 7606 operation_type = operation.get("type", "sql") 7607 if operation_type == "python": 7608 self.calculation_process_function( 7609 operation=operation, operation_name=operation_name 7610 ) 7611 elif operation_type == "sql": 7612 self.calculation_process_sql( 7613 operation=operation, operation_name=operation_name 7614 ) 7615 else: 7616 log.error( 7617 f"Operations config: Type '{operation_type}' NOT available" 7618 ) 7619 raise ValueError( 7620 f"Operations config: Type '{operation_type}' NOT available" 7621 ) 7622 else: 7623 log.error( 7624 f"Operations config: Calculation '{operation_name}' NOT available" 7625 ) 7626 raise ValueError( 7627 f"Operations config: Calculation '{operation_name}' NOT available" 7628 ) 7629 7630 # Explode INFOS fields into table fields 7631 if self.get_explode_infos(): 7632 self.explode_infos( 7633 prefix=self.get_explode_infos_prefix(), 7634 fields=self.get_explode_infos_fields(), 7635 force=True, 7636 )
It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function
param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }
7638 def calculation_process_sql( 7639 self, operation: dict, operation_name: str = "unknown" 7640 ) -> None: 7641 """ 7642 The `calculation_process_sql` function takes in a mathematical operation as a string and 7643 performs the operation, updating the specified table with the result. 7644 7645 :param operation: The `operation` parameter is a dictionary that contains information about the 7646 mathematical operation to be performed. It includes the following keys: 7647 :type operation: dict 7648 :param operation_name: The `operation_name` parameter is a string that represents the name of 7649 the mathematical operation being performed. It is used for logging and error handling purposes, 7650 defaults to unknown 7651 :type operation_name: str (optional) 7652 """ 7653 7654 # table variants 7655 table_variants = self.get_table_variants(clause="alter") 7656 7657 # Operation infos 7658 operation_name = operation.get("name", "unknown") 7659 log.debug(f"process sql {operation_name}") 7660 output_column_name = operation.get("output_column_name", operation_name) 7661 output_column_type = operation.get("output_column_type", "String") 7662 prefix = operation.get("explode_infos_prefix", "") 7663 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7664 output_column_description = operation.get( 7665 "output_column_description", f"{operation_name} operation" 7666 ) 7667 operation_query = operation.get("operation_query", None) 7668 if isinstance(operation_query, list): 7669 operation_query = " ".join(operation_query) 7670 operation_info_fields = operation.get("info_fields", []) 7671 operation_info_fields_check = operation.get("info_fields_check", False) 7672 operation_info = operation.get("operation_info", True) 7673 7674 if operation_query: 7675 7676 # Info fields check 7677 operation_info_fields_check_result = True 7678 if operation_info_fields_check: 7679 header_infos = self.get_header().infos 7680 for info_field in operation_info_fields: 7681 
operation_info_fields_check_result = ( 7682 operation_info_fields_check_result 7683 and info_field in header_infos 7684 ) 7685 7686 # If info fields available 7687 if operation_info_fields_check_result: 7688 7689 # Added_columns 7690 added_columns = [] 7691 7692 # Create VCF header field 7693 vcf_reader = self.get_header() 7694 vcf_reader.infos[output_column_name] = vcf.parser._Info( 7695 output_column_name, 7696 ".", 7697 output_column_type, 7698 output_column_description, 7699 "howard calculation", 7700 "0", 7701 self.code_type_map.get(output_column_type), 7702 ) 7703 7704 # Explode infos if needed 7705 log.debug(f"calculation_process_sql prefix {prefix}") 7706 added_columns += self.explode_infos( 7707 prefix=prefix, 7708 fields=[output_column_name] + operation_info_fields, 7709 force=True, 7710 ) 7711 7712 # Create column 7713 added_column = self.add_column( 7714 table_name=table_variants, 7715 column_name=prefix + output_column_name, 7716 column_type=output_column_type_sql, 7717 default_value="null", 7718 ) 7719 added_columns.append(added_column) 7720 7721 # Operation calculation 7722 try: 7723 7724 # Query to update calculation column 7725 sql_update = f""" 7726 UPDATE {table_variants} 7727 SET "{prefix}{output_column_name}" = ({operation_query}) 7728 """ 7729 self.conn.execute(sql_update) 7730 7731 # Add to INFO 7732 if operation_info: 7733 sql_update_info = f""" 7734 UPDATE {table_variants} 7735 SET "INFO" = 7736 concat( 7737 CASE 7738 WHEN "INFO" IS NOT NULL 7739 THEN concat("INFO", ';') 7740 ELSE '' 7741 END, 7742 '{output_column_name}=', 7743 "{prefix}{output_column_name}" 7744 ) 7745 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 7746 """ 7747 self.conn.execute(sql_update_info) 7748 7749 except: 7750 log.error( 7751 f"Operations config: Calculation '{operation_name}' query failed" 7752 ) 7753 raise ValueError( 7754 f"Operations config: Calculation '{operation_name}' query failed" 7755 ) 7756 7757 # Remove 
added columns 7758 for added_column in added_columns: 7759 log.debug(f"added_column: {added_column}") 7760 self.drop_column(column=added_column) 7761 7762 else: 7763 log.error( 7764 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7765 ) 7766 raise ValueError( 7767 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7768 ) 7769 7770 else: 7771 log.error( 7772 f"Operations config: Calculation '{operation_name}' query NOT defined" 7773 ) 7774 raise ValueError( 7775 f"Operations config: Calculation '{operation_name}' query NOT defined" 7776 )
The calculation_process_sql function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
7778 def calculation_process_function( 7779 self, operation: dict, operation_name: str = "unknown" 7780 ) -> None: 7781 """ 7782 The `calculation_process_function` takes in an operation dictionary and performs the specified 7783 function with the given parameters. 7784 7785 :param operation: The `operation` parameter is a dictionary that contains information about the 7786 operation to be performed. It has the following keys: 7787 :type operation: dict 7788 :param operation_name: The `operation_name` parameter is a string that represents the name of 7789 the operation being performed. It is used for logging purposes, defaults to unknown 7790 :type operation_name: str (optional) 7791 """ 7792 7793 operation_name = operation["name"] 7794 log.debug(f"process sql {operation_name}") 7795 function_name = operation["function_name"] 7796 function_params = operation["function_params"] 7797 getattr(self, function_name)(*function_params)
The calculation_process_function takes in an operation dictionary and performs the specified
function with the given parameters.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
7799 def calculation_variant_id(self) -> None: 7800 """ 7801 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 7802 updates the INFO field of a variants table with the variant ID. 7803 """ 7804 7805 # variant_id annotation field 7806 variant_id_tag = self.get_variant_id_column() 7807 added_columns = [variant_id_tag] 7808 7809 # variant_id hgvs tags" 7810 vcf_infos_tags = { 7811 variant_id_tag: "howard variant ID annotation", 7812 } 7813 7814 # Variants table 7815 table_variants = self.get_table_variants() 7816 7817 # Header 7818 vcf_reader = self.get_header() 7819 7820 # Add variant_id to header 7821 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 7822 variant_id_tag, 7823 ".", 7824 "String", 7825 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 7826 "howard calculation", 7827 "0", 7828 self.code_type_map.get("String"), 7829 ) 7830 7831 # Update 7832 sql_update = f""" 7833 UPDATE {table_variants} 7834 SET "INFO" = 7835 concat( 7836 CASE 7837 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7838 THEN '' 7839 ELSE concat("INFO", ';') 7840 END, 7841 '{variant_id_tag}=', 7842 "{variant_id_tag}" 7843 ) 7844 """ 7845 self.conn.execute(sql_update) 7846 7847 # Remove added columns 7848 for added_column in added_columns: 7849 self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new INFO tag in the variants table.

        :param snpeff_hgvs: The `snpeff_hgvs` parameter is the name of the INFO tag (and working
            column) that will store the HGVS nomenclatures extracted from the SnpEff annotation
            field, defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: The `snpeff_field` parameter is the INFO field that contains the
            SnpEff annotations to extract from, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # Snpeff hgvs tags (descriptions for the VCF header)
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty prefix is replaced by "INFO/" — confirm a custom
        # prefix being discarded here is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff working column names
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header
        vcf_reader = self.get_header()

        # Columns added during this calculation (dropped at the end)
        added_columns = []

        # Explode the SnpEff annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract the ANN sub-field names from the quoted part of the header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized (alphanumeric-only) name -> original sub-field label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column used as the join key for the update below
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Fetch variant IDs and exploded annotations into a pandas dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Compute the HGVS column from each row's SnpEff annotation string
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Declare the snpeff_hgvs INFO tag in the VCF header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append the computed tag to INFO.
            # The SQL references the local pandas dataframe by its Python variable name
            # ('dataframe_snpeff_hgvs') — DuckDB resolves it via replacement scan, so the
            # variable name must match the name used in the query.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory eagerly
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Drop the working columns added for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.
Parameters
- snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file, defaults to snpeff_hgvs
- snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a new column, defaults to ANN
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file
        by exploding the annotation field and updating variant information accordingly.

        :param uniquify: The `uniquify` parameter is a boolean flag forwarded to
            `explode_snpeff_ann` that determines whether duplicate entries should be removed
            from the output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter specifies the format of the output
            annotations: "fields" produces one INFO tag per ANN sub-field, "JSON" produces a
            single JSON-valued tag, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter is the prefix added to the generated
            INFO tags to differentiate them from existing annotations, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter specifies the INFO field that contains
            the SnpEff annotations to explode, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # Internal name for the exploded-annotation working column
        snpeff_hgvs = "snpeff_ann_explode"

        # Descriptions for the VCF header
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty prefix is replaced by "INFO/" — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff working column names
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header
        vcf_reader = self.get_header()

        # Columns added during this calculation (dropped at the end)
        added_columns = []

        # Explode the SnpEff annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract the ANN sub-field names from the quoted part of the header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized (alphanumeric-only) name -> original sub-field label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column used as the join key for the update below
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Fetch variant IDs and exploded annotations into a pandas dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Compute the exploded-annotation column for each row
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Declare the generated tag(s) in the VCF header:
            # one JSON-valued tag, or one tag per ANN sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                # JSON mode: the computed value is the JSON payload, so the tag name
                # and '=' must be prepended in the SQL update below
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                # "fields" mode: the computed value already contains 'tag=value' pairs,
                # so no extra prefix is needed in the SQL update
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the computed annotations to INFO.
            # The SQL references the local pandas dataframe by its Python variable name
            # ('dataframe_snpeff_hgvs') — DuckDB resolves it via replacement scan.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory eagerly
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Drop the working columns added for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by
exploding the HGVS field and updating variant information accordingly.
Parameters
- uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a boolean flag that determines whether the output should be uniquified. When set to `True`, duplicate entries are removed from the output, defaults to True
- output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
- output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` method is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data, defaults to snpeff_
- snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the annotations and update the variant information accordingly, defaults to ANN
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        Reads the HGVS field configured in param['calculation']['calculations']['NOMEN']
        ['options'], computes the NOMEN structure per variant with `find_nomen`, declares one
        INFO tag per NOMEN component in the VCF header, and appends the non-empty components
        to the INFO column of the variants table.

        :raises ValueError: if a configured transcripts file does not exist
        """

        # Working column holding the full NOMEN dict per variant
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: component tag -> header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Parameters
        param = self.get_param()

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # VCF header
        vcf_reader = self.get_header()

        # INFO field holding the HGVS nomenclatures to parse
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional transcripts-of-preference file (first column = transcript IDs)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added during this calculation (dropped at the end)
        added_columns = []

        # Explode the HGVS INFO field into its own column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Exploded column name to read from
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Fetch variant keys and HGVS values into a pandas dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Compute the NOMEN dict for each variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode each NOMEN component into its own column, declare its header
            # entry, and build the corresponding SQL fragment for the INFO update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # One column per component (the lambda is applied immediately, so
                # closing over the loop variable is safe here)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Declare the component tag in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # Append ';tag=value' only when the component is present
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # Combine all component fragments into one SET expression
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append the NOMEN components to INFO.
            # The SQL references the local pandas dataframe by its Python variable name
            # ('dataframe_hgvs') — DuckDB resolves it via replacement scan; the join key
            # is the (#CHROM, POS, REF, ALT) tuple.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the dataframe memory eagerly
            del dataframe_hgvs
            gc.collect()

        # Drop the working columns added for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        Does nothing when the header has no FORMAT column or no samples.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
            "findbypipeline" information in the VCF file. It is used to create the annotation
            field in the VCF header and to update the corresponding field in the variants table,
            defaults to findbypipeline
        :type tag: str (optional)
        """

        # Only applicable when genotype data (FORMAT + samples) is present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # Description for the VCF header
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Working column name
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # VCF header
            vcf_reader = self.get_header()

            # Variant ID column used as the join key for the update below
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant ID, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotype data into a pandas dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value per variant from the sample columns
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Declare the findbypipeline INFO tag in the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append the computed tag to INFO.
            # The SQL references the local pandas dataframe by its Python variable name
            # ('dataframe_findbypipeline') — DuckDB resolves it via replacement scan.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the working columns added for this calculation
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory eagerly
            del dataframe_findbypipeline
            gc.collect()
The function calculation_find_by_pipeline performs a calculation to find the number of
pipeline/sample for a variant and updates the variant information in a VCF file.
Parameters
- tag: The `tag` parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.

        Does nothing when the header has no FORMAT column or no samples.
        """

        # Only applicable when genotype data (FORMAT + samples) is present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # Description for the VCF header
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Working column name
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # VCF header
            vcf_reader = self.get_header()

            # Variant ID column used as the join key for the update below
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant ID, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotype data into a pandas dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the concordance value per variant from the sample columns
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Declare the genotypeconcordance INFO tag in the VCF header
            # NOTE(review): the fallback description "snpEff hgvs annotations" looks
            # copy-pasted; it is dead in practice since the key is always present above
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append the computed tag to INFO.
            # The SQL references the local pandas dataframe by its Python variable name
            # ('dataframe_genotypeconcordance') — DuckDB resolves it via replacement scan.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the working columns added for this calculation
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory eagerly
            del dataframe_genotypeconcordance
            gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values (VaRank style) for variants
        in a VCF file and updates the INFO field with the calculated barcode values.

        Does nothing when the header has no FORMAT column or no samples.

        :param tag: The `tag` parameter is used to specify the tag name that will be used for
            the barcode calculation in the VCF file. If an empty/None value is provided, the
            tag name falls back to "barcode", defaults to barcode
        :type tag: str (optional)
        """

        # Only applicable when genotype data (FORMAT + samples) is present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag name when an empty value is given
            if not tag:
                tag = "barcode"

            # Description for the VCF header
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Working column name
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # VCF header
            vcf_reader = self.get_header()

            # Variant ID column used as the join key for the update below
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant ID, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotype data into a pandas dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the barcode value per variant from the sample columns
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Declare the barcode INFO tag in the VCF header
            # NOTE(review): the fallback `vcf_infos_tags.get(tag)` is redundant — it is
            # the same lookup as the primary one
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append the computed tag to INFO.
            # The SQL references the local pandas dataframe by its Python variable name
            # ('dataframe_barcode') — DuckDB resolves it via replacement scan.
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                            AND dataframe_barcode."{barcode_infos}" NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the working columns added for this calculation
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory eagerly
            del dataframe_barcode
            gc.collect()
The calculation_barcode function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name "barcode" is used, defaults to barcode
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates a family barcode for variants
        in a VCF file and appends the barcode (and the list of family samples) as extra
        FORMAT values on every sample column.

        The family members come from param['calculation']['calculations']['BARCODEFAMILY']
        ['family_pedigree'], which may be a JSON file path, a JSON string, a comma-separated
        sample list, or a dict; when absent, all samples from the header are used.
        Does nothing when the header has no FORMAT column or no samples.

        :param tag: The `tag` parameter is used to specify the FORMAT tag added during the
            calculation (a companion '<tag>S' tag lists the family samples). If an empty/None
            value is provided, the tag falls back to "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not a str/dict, or resolves to no samples
        """

        # Only applicable when genotype data (FORMAT + samples) is present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag name when an empty value is given
            if not tag:
                tag = "BCF"

            # Descriptions for the VCF header (barcode tag + family-samples tag)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Parameters
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree configuration (file path, JSON string, sample list or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a file: load it as JSON
                # NOTE(review): the file handle rebinds 'ped' inside the 'with' before
                # being replaced by the parsed JSON — works, but easy to misread
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated sample names
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict: use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Family sample list (dict values)
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample from the header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # A non-empty pedigree is mandatory at this point
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the resolved family members
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Working column name
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # VCF header
            vcf_reader = self.get_header()

            # Variant ID column used as the join key for the update below
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant ID, FORMAT and the family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotype data into a pandas dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode per variant from the family sample columns
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the barcode family FORMAT tags ('<tag>' and '<tag>S') in the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per sample column (plus FORMAT):
            # - family samples get the computed barcode and the family sample list
            # - FORMAT gets the new tag names
            # - non-family samples get '.' placeholders
            # A './.'-only genotype is first padded with one '.' per existing FORMAT key.
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single update.
            # The SQL references the local pandas dataframe by its Python variable name
            # ('dataframe_barcode') — DuckDB resolves it via replacement scan.
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the working columns added for this calculation
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory eagerly
            del dataframe_barcode
            gc.collect()
The calculation_barcode_family function calculates barcode values for variants in a VCF file
and updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the `tag` parameter, the default value used is "BCF"; defaults to "BCF".
    def calculation_trio(self) -> None:
        """
        Perform trio (father/mother/child) calculation on the VCF and append the
        result to the INFO field of each variant as a 'trio' tag.

        The trio is defined by the 'TRIO.trio_pedigree' calculation parameter,
        which may be a JSON file path, a JSON string, a comma-separated list of
        exactly 3 sample names, or a dict with 'father'/'mother'/'child' keys;
        when absent, the first 3 samples of the VCF header are used.

        :raises ValueError: if the pedigree is malformed or fewer than 3 samples
            are available
        """

        # Only applicable if the VCF has genotypes (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field (INFO tag name)
            trio_tag = "trio"

            # VCF infos tags (descriptions written to the header)
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Trio param
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Trio pedigree is a string (JSON, or 'father,mother,child')
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        # Not JSON: expect exactly 3 comma-separated sample names
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio list (ordered: father, mother, child)
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Field (dataframe column holding the computed trio annotation)
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create trio column (computed in Python by the `trio()` helper)
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # looks copy-pasted from another calculation; it is dead in practice
            # since vcf_infos_tags always contains the 'trio' key.
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO: append 'trio=<value>' with ';' separator when the
            # computed value is neither empty nor missing
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                            AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                '{trio_tag}=',
                                dataframe_trio."{trio_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe (free memory)
            del dataframe_trio
            gc.collect()
The calculation_trio function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
8982 def calculation_vaf_normalization(self) -> None: 8983 """ 8984 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 8985 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 8986 :return: The function does not return anything. 8987 """ 8988 8989 # if FORMAT and samples 8990 if ( 8991 "FORMAT" in self.get_header_columns_as_list() 8992 and self.get_header_sample_list() 8993 ): 8994 8995 # vaf_normalization annotation field 8996 vaf_normalization_tag = "VAF" 8997 8998 # VCF infos tags 8999 vcf_infos_tags = { 9000 "VAF": "VAF Variant Frequency", 9001 } 9002 9003 # Prefix 9004 prefix = self.get_explode_infos_prefix() 9005 9006 # Variants table 9007 table_variants = self.get_table_variants() 9008 9009 # Header 9010 vcf_reader = self.get_header() 9011 9012 # Do not calculate if VAF already exists 9013 if "VAF" in vcf_reader.formats: 9014 log.debug("VAF already on genotypes") 9015 return 9016 9017 # Create variant id 9018 variant_id_column = self.get_variant_id_column() 9019 added_columns = [variant_id_column] 9020 9021 # variant_id, FORMAT and samples 9022 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9023 f""" "{sample}" """ for sample in self.get_header_sample_list() 9024 ) 9025 9026 # Create dataframe 9027 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9028 log.debug(f"query={query}") 9029 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9030 9031 vaf_normalization_set = [] 9032 9033 # for each sample vaf_normalization 9034 for sample in self.get_header_sample_list(): 9035 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9036 lambda row: vaf_normalization(row, sample=sample), axis=1 9037 ) 9038 vaf_normalization_set.append( 9039 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9040 ) 9041 9042 # Add VAF to FORMAT 9043 dataframe_vaf_normalization["FORMAT"] = 
dataframe_vaf_normalization[ 9044 "FORMAT" 9045 ].apply(lambda x: str(x) + ":VAF") 9046 vaf_normalization_set.append( 9047 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9048 ) 9049 9050 # Add vaf_normalization to header 9051 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9052 id=vaf_normalization_tag, 9053 num="1", 9054 type="Float", 9055 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9056 type_code=self.code_type_map.get("Float"), 9057 ) 9058 9059 # Create fields to add in INFO 9060 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9061 9062 # Update 9063 sql_update = f""" 9064 UPDATE {table_variants} 9065 SET {sql_vaf_normalization_set} 9066 FROM dataframe_vaf_normalization 9067 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9068 9069 """ 9070 self.conn.execute(sql_update) 9071 9072 # Remove added columns 9073 for added_column in added_columns: 9074 self.drop_column(column=added_column) 9075 9076 # Delete dataframe 9077 del dataframe_vaf_normalization 9078 gc.collect()
The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
Returns
The function does not return anything.
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Calculate genotype statistics for a given genotype information field and
        append them to the INFO column of the variants table.

        For the given `info` field (e.g. "VAF"), the following INFO tags are
        generated: `<info>_stats_nb`, `<info>_stats_list`, `<info>_stats_min`,
        `<info>_stats_max`, `<info>_stats_mean`, `<info>_stats_mediane` and
        `<info>_stats_stdev` (values computed by the `genotype_stats()` helper).

        :param info: genotype information field for which statistics are
            calculated, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable if the VCF has genotypes (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags (one INFO tag and description per statistic)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field (dataframe column holding the stats dict per variant)
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: a dict of statistics per variant
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract stats: one dataframe column per statistic
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the statistic tag to the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: the first appended field carries none, the
                # following ones a leading ';'.
                # NOTE(review): if an earlier stat value is NULL its CASE
                # yields '' while the next field still carries its leading ';',
                # which can produce a double ';;' in INFO — confirm whether
                # downstream parsers tolerate this.
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update INFO with all computed statistics
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe (free memory)
            del dataframe_vaf_stats
            gc.collect()
The calculation_genotype_stats function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.
Parameters
- info: The `info` parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, and the median; defaults to "VAF".
9218 def calculation_transcripts_json(self, info: str = "transcripts_json") -> None: 9219 """ 9220 The function `calculation_transcripts_json` creates a transcripts table and adds an info field 9221 to it if transcripts are available. 9222 9223 :param info: The `info` parameter in the `calculation_transcripts_json` method is a string 9224 parameter that specifies the information field to be used in the transcripts JSON. It has a 9225 default value of "transcripts_json" if no value is provided when calling the method, defaults to 9226 transcripts_json 9227 :type info: str (optional) 9228 """ 9229 9230 # Create transcripts table 9231 transcripts_table = self.create_transcript_view() 9232 9233 # Add info field 9234 if transcripts_table: 9235 self.transcript_view_to_variants( 9236 transcripts_table=transcripts_table, transcripts_info_field=info 9237 ) 9238 else: 9239 log.info("No Transcripts to process. Check param.json file configuration")
The function calculation_transcripts_json creates a transcripts table and adds an info field
to it if transcripts are available.
Parameters
- info: The `info` parameter in the `calculation_transcripts_json` method is a string parameter that specifies the information field to be used in the transcripts JSON. It has a default value of "transcripts_json" if no value is provided when calling the method; defaults to "transcripts_json".
9245 def create_transcript_view_from_columns_map( 9246 self, 9247 transcripts_table: str = "transcripts", 9248 columns_maps: dict = {}, 9249 added_columns: list = [], 9250 temporary_tables: list = None, 9251 annotation_fields: list = None, 9252 ) -> tuple[list, list, list]: 9253 """ 9254 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 9255 specified columns mapping for transcripts data. 9256 9257 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9258 the table where the transcripts data is stored or will be stored in the database. This table 9259 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9260 predictions, etc. It defaults to "transcripts, defaults to transcripts 9261 :type transcripts_table: str (optional) 9262 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9263 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9264 represents a mapping configuration for a specific set of columns. It typically includes details such 9265 as the main transcript column and additional information columns 9266 :type columns_maps: dict 9267 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9268 function is a list that stores the additional columns that will be added to the view being created 9269 based on the columns map provided. These columns are generated by exploding the transcript 9270 information columns along with the main transcript column 9271 :type added_columns: list 9272 :param temporary_tables: The `temporary_tables` parameter in the 9273 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9274 tables created during the process of creating a transcript view from a columns map. 
These temporary 9275 tables are used to store intermediate results or transformations before the final view is generated 9276 :type temporary_tables: list 9277 :param annotation_fields: The `annotation_fields` parameter in the 9278 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9279 for annotation in the query view creation process. These fields are extracted from the 9280 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9281 :type annotation_fields: list 9282 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9283 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 9284 """ 9285 9286 log.debug("Start transcrpts view creation from columns map...") 9287 9288 # "from_columns_map": [ 9289 # { 9290 # "transcripts_column": "Ensembl_transcriptid", 9291 # "transcripts_infos_columns": [ 9292 # "genename", 9293 # "Ensembl_geneid", 9294 # "LIST_S2_score", 9295 # "LIST_S2_pred", 9296 # ], 9297 # }, 9298 # { 9299 # "transcripts_column": "Ensembl_transcriptid", 9300 # "transcripts_infos_columns": [ 9301 # "genename", 9302 # "VARITY_R_score", 9303 # "Aloft_pred", 9304 # ], 9305 # }, 9306 # ], 9307 9308 # Init 9309 if temporary_tables is None: 9310 temporary_tables = [] 9311 if annotation_fields is None: 9312 annotation_fields = [] 9313 9314 # Variants table 9315 table_variants = self.get_table_variants() 9316 9317 for columns_map in columns_maps: 9318 9319 # Transcript column 9320 transcripts_column = columns_map.get("transcripts_column", None) 9321 9322 # Transcripts infos columns 9323 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9324 9325 if transcripts_column is not None: 9326 9327 # Explode 9328 added_columns += self.explode_infos( 9329 fields=[transcripts_column] + transcripts_infos_columns 9330 ) 9331 9332 # View clauses 9333 clause_select = [] 9334 for field in [transcripts_column] + 
transcripts_infos_columns: 9335 clause_select.append( 9336 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9337 ) 9338 if field not in [transcripts_column]: 9339 annotation_fields.append(field) 9340 9341 # Querey View 9342 query = f""" 9343 SELECT 9344 "#CHROM", POS, REF, ALT, 9345 "{transcripts_column}" AS 'transcript', 9346 {", ".join(clause_select)} 9347 FROM ( 9348 SELECT 9349 "#CHROM", POS, REF, ALT, 9350 {", ".join(clause_select)} 9351 FROM {table_variants} 9352 ) 9353 WHERE "{transcripts_column}" IS NOT NULL 9354 """ 9355 9356 # Create temporary table 9357 temporary_table = transcripts_table + "".join( 9358 random.choices(string.ascii_uppercase + string.digits, k=10) 9359 ) 9360 9361 # Temporary_tables 9362 temporary_tables.append(temporary_table) 9363 query_view = f""" 9364 CREATE TEMPORARY TABLE {temporary_table} 9365 AS ({query}) 9366 """ 9367 self.execute_query(query=query_view) 9368 9369 return added_columns, temporary_tables, annotation_fields
The create_transcript_view_from_columns_map function generates a temporary table view based on
specified columns mapping for transcripts data.
Parameters
- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc.; defaults to "transcripts".
- columns_maps: The `columns_maps` parameter contains information about how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns.
- added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column.
- temporary_tables: The `temporary_tables` parameter in the `create_transcript_view_from_columns_map` function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated.
- annotation_fields: The `annotation_fields` parameter in the `create_transcript_view_from_columns_map` function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the `transcripts_column` and `transcripts_infos_columns` specified in the `columns_maps`.
Returns
The function `create_transcript_view_from_columns_map` returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
9371 def create_transcript_view_from_column_format( 9372 self, 9373 transcripts_table: str = "transcripts", 9374 column_formats: dict = {}, 9375 temporary_tables: list = None, 9376 annotation_fields: list = None, 9377 ) -> tuple[list, list, list]: 9378 """ 9379 The `create_transcript_view_from_column_format` function generates a transcript view based on 9380 specified column formats, adds additional columns and annotation fields, and returns the list of 9381 temporary tables and annotation fields. 9382 9383 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9384 the table containing the transcripts data. This table will be used as the base table for creating 9385 the transcript view. The default value for this parameter is "transcripts", but you can provide a 9386 different table name if needed, defaults to transcripts 9387 :type transcripts_table: str (optional) 9388 :param column_formats: The `column_formats` parameter is a dictionary that contains information 9389 about the columns to be used for creating the transcript view. Each entry in the dictionary 9390 specifies the mapping between a transcripts column and a transcripts infos column. For example, in 9391 the provided code snippet: 9392 :type column_formats: dict 9393 :param temporary_tables: The `temporary_tables` parameter in the 9394 `create_transcript_view_from_column_format` function is a list that stores the names of temporary 9395 views created during the process of creating a transcript view from a column format. These temporary 9396 views are used to manipulate and extract data before generating the final transcript view. It 9397 :type temporary_tables: list 9398 :param annotation_fields: The `annotation_fields` parameter in the 9399 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 9400 that are extracted from the temporary views created during the process. 
These annotation fields are 9401 obtained by querying the temporary views and extracting the column names excluding specific columns 9402 like `#CH 9403 :type annotation_fields: list 9404 :return: The `create_transcript_view_from_column_format` function returns two lists: 9405 `temporary_tables` and `annotation_fields`. 9406 """ 9407 9408 log.debug("Start transcrpts view creation from column format...") 9409 9410 # "from_column_format": [ 9411 # { 9412 # "transcripts_column": "ANN", 9413 # "transcripts_infos_column": "Feature_ID", 9414 # } 9415 # ], 9416 9417 # Init 9418 if temporary_tables is None: 9419 temporary_tables = [] 9420 if annotation_fields is None: 9421 annotation_fields = [] 9422 9423 for column_format in column_formats: 9424 9425 # annotation field and transcript annotation field 9426 annotation_field = column_format.get("transcripts_column", "ANN") 9427 transcript_annotation = column_format.get( 9428 "transcripts_infos_column", "Feature_ID" 9429 ) 9430 9431 # Temporary View name 9432 temporary_view_name = transcripts_table + "".join( 9433 random.choices(string.ascii_uppercase + string.digits, k=10) 9434 ) 9435 9436 # Create temporary view name 9437 temporary_view_name = self.annotation_format_to_table( 9438 uniquify=True, 9439 annotation_field=annotation_field, 9440 view_name=temporary_view_name, 9441 annotation_id=transcript_annotation, 9442 ) 9443 9444 # Annotation fields 9445 if temporary_view_name: 9446 query_annotation_fields = f""" 9447 SELECT * 9448 FROM ( 9449 DESCRIBE SELECT * 9450 FROM {temporary_view_name} 9451 ) 9452 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9453 """ 9454 df_annotation_fields = self.get_query_to_df( 9455 query=query_annotation_fields 9456 ) 9457 9458 # Add temporary view and annotation fields 9459 temporary_tables.append(temporary_view_name) 9460 annotation_fields += list(set(df_annotation_fields["column_name"])) 9461 9462 return temporary_tables, annotation_fields
The create_transcript_view_from_column_format function generates a transcript view based on
specified column formats, adds additional columns and annotation fields, and returns the list of
temporary tables and annotation fields.
Parameters
- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed; defaults to "transcripts".
- column_formats: The `column_formats` parameter contains information about the columns to be used for creating the transcript view. Each entry specifies the mapping between a transcripts column and a transcripts infos column.
- temporary_tables: The `temporary_tables` parameter in the `create_transcript_view_from_column_format` function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view.
- annotation_fields: The `annotation_fields` parameter in the `create_transcript_view_from_column_format` function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding specific columns like `#CHROM`, `POS`, `REF`, and `ALT`.
Returns
The `create_transcript_view_from_column_format` function returns two lists: `temporary_tables` and `annotation_fields`.
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = True,
        param: dict = {},
    ) -> str:
        """
        Generate the transcripts table from the variants table, according to the
        'transcripts.struct' parameter (columns maps and/or column formats).

        Temporary per-source tables are built by the two helper methods, merged
        with UNION BY NAME, then aggregated per ("#CHROM", POS, REF, ALT,
        transcript) into the final transcripts table.

        :param transcripts_table: name of the table to create; when None, taken
            from the 'transcripts.table' parameter (default "transcripts"),
            defaults to None
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: drop an existing transcripts table before
            creating the new one, defaults to True
        :type transcripts_table_drop: bool (optional)
        :param param: parameters dict; when empty, self.get_param() is used
        :type param: dict
        :return: the name of the created transcripts table, or None when no
            'transcripts.struct' configuration is available
        """

        log.debug("Start transcrpts view creation...")

        # Default
        transcripts_table_default = "transcripts"

        # Param
        if not param:
            param = self.get_param()

        # Struct
        struct = param.get("transcripts", {}).get("struct", None)

        if struct:

            # Transcripts table
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns exploded from INFO (dropped at the end)
            added_columns = []

            # Temporary tables to merge
            temporary_tables = []

            # Annotation fields to aggregate
            annotation_fields = []

            # from columns map
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # from column format
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Merge temporary tables query
            query_merge = ""
            for temporary_table in temporary_tables:

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # other temporary table (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Merge on transcript
            query_merge_on_transcripts_annotation_fields = []
            # Aggregate all annotations fields (distinct values joined by ',')
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )
            # Query for transcripts view: one row per variant/transcript pair
            query_merge_on_transcripts = f"""
                SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
                FROM ({query_merge})
                GROUP BY "#CHROM", POS, REF, ALT, transcript
            """

            # Drop transcript view if necessary
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcript view
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            transcripts_table = None

        return transcripts_table
The `create_transcript_view` function generates a transcript view by processing data from a
specified table based on provided parameters and structural information.

Parameters

- transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data; defaults to "transcripts".
- transcripts_table_drop: The `transcripts_table_drop` parameter in the `create_transcript_view` function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, the function will drop the existing transcripts table if it exists; defaults to True.
- param: The `param` parameter in the `create_transcript_view` function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization.

Returns

The `create_transcript_view` function returns the name of the transcripts table that was created or modified during the execution of the function.
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        The function `annotation_format_to_table` converts annotation data from a VCF file into
        a structured table format (one temporary table with one typed column per annotation
        key, plus a 'transcript' column).

        :param uniquify: Boolean flag passed to `explode_annotation_format`, determining
            whether to ensure unique values in the exploded annotations, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: INFO field of the VCF header that contains the annotation
            information for each variant, defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: Identifier of the annotation feature; exposed as the
            'transcript' column of the resulting table, defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table created to store the transformed
            annotation data, defaults to transcripts
        :type view_name: str (optional)
        :return: The name of the created view (`view_name`), or None when `annotation_field`
            is not present in the VCF header
        """

        # Name of the intermediate column holding the exploded annotations (JSON)
        annotation_format = "annotation_explode"

        # Sanitize annotation id (keep alphanumeric characters only)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix
        # NOTE(review): any truthy prefix is replaced by the literal "INFO/" — the configured
        # prefix value itself is discarded; confirm this is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Annotation fields (prefixed column names)
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added to the variants table during the process (dropped at the end)
        added_columns = []

        # Explode annotation field in column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract ANN header: field names are expected inside single quotes of the INFO
            # description, separated by " | "
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized header name (alphanumeric only) mapped to original name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (added to the variants table, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe with variant coordinates and the exploded annotation column
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Create annotation columns: one JSON document per variant, keyed by the
            # original header names
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find keys of the first JSON entry
            # NOTE(review): these queries reference the unprefixed column name
            # ({annotation_format}), which matches the dataframe column
            # ({annotation_format_infos}) only when prefix is empty — confirm against
            # get_explode_infos_prefix()
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Check keys: build one typed SELECT clause per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # key_clean (alphanumeric characters only)
                key_clean = "".join(char for char in key if char.isalnum())

                # Type: collect all values of this key to detect the column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the non-empty values
                column_type = detect_column_type(df_json_type[key_clean])

                # Append typed extraction clause (empty strings become NULL)
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create view: one column per annotation key, plus 'transcript' (annotation_id)
            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
            self.execute_query(query=query_view)

        else:

            # Annotation field not in header: return None
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
The function `annotation_format_to_table` converts annotation data from a VCF file into a
structured table format.

Parameters

- uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to `True`, the function will make sure that the output values are unique; defaults to True.
- annotation_field: The `annotation_field` parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function; defaults to "ANN".
- annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data, and helps in uniquely identifying each annotation entry; defaults to "Feature_ID".
- view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis; defaults to "transcripts".

Returns

The function `annotation_format_to_table` returns the name of the view created, which is stored in the variable `view_name`.
9756 def transcript_view_to_variants( 9757 self, 9758 transcripts_table: str = None, 9759 transcripts_column_id: str = None, 9760 transcripts_info_json: str = None, 9761 transcripts_info_field: str = None, 9762 param: dict = {}, 9763 ) -> bool: 9764 """ 9765 The function `transcript_view_to_variants` takes input parameters related to transcripts and updates 9766 a variants table with information from the transcripts in JSON format. 9767 9768 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the table 9769 containing the transcripts data. If this parameter is not provided, the function will attempt to 9770 retrieve it from the `param` dictionary or use a default value of "transcripts" 9771 :type transcripts_table: str 9772 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in 9773 the `transcripts_table` that contains the unique identifier for each transcript. This identifier is 9774 used to match transcripts with variants in the database 9775 :type transcripts_column_id: str 9776 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of 9777 the column in the variants table where the transcripts information will be stored in JSON format 9778 :type transcripts_info_json: str 9779 :param transcripts_info_field: The `transcripts_info_field` parameter is used to specify the field 9780 in the VCF header that will contain information about transcripts in JSON format. This field will be 9781 added to the VCF header as an INFO field with the specified name 9782 :type transcripts_info_field: str 9783 :param param: The `transcript_view_to_variants` method takes several parameters: 9784 :type param: dict 9785 :return: The function `transcript_view_to_variants` returns a boolean value, which is `True` if the 9786 operation is successful and `False` if certain conditions are not met. 
9787 """ 9788 9789 log.debug("Start transcripts view to JSON...") 9790 9791 # Default 9792 transcripts_table_default = "transcripts" 9793 transcripts_column_id_default = "transcript" 9794 transcripts_info_json_default = None 9795 transcripts_info_field_default = None 9796 9797 # Param 9798 if not param: 9799 param = self.get_param() 9800 9801 # Transcripts table 9802 if transcripts_table is None: 9803 transcripts_table = param.get("transcripts", {}).get( 9804 "table", transcripts_table_default 9805 ) 9806 9807 # Transcripts column ID 9808 if transcripts_column_id is None: 9809 transcripts_column_id = param.get("transcripts", {}).get( 9810 "column_id", transcripts_column_id_default 9811 ) 9812 9813 # Transcripts info field 9814 if transcripts_info_json is None: 9815 transcripts_info_json = param.get("transcripts", {}).get( 9816 "transcripts_info_json", transcripts_info_json_default 9817 ) 9818 9819 # Transcripts info field 9820 if transcripts_info_field is None: 9821 transcripts_info_field = param.get("transcripts", {}).get( 9822 "transcripts_info_field", transcripts_info_field_default 9823 ) 9824 9825 # Variants table 9826 table_variants = self.get_table_variants() 9827 9828 # Check info columns param 9829 if transcripts_info_json is None and transcripts_info_field is None: 9830 return False 9831 9832 # Transcripts infos columns 9833 query_transcripts_infos_columns = f""" 9834 SELECT * 9835 FROM ( 9836 DESCRIBE SELECT * FROM {transcripts_table} 9837 ) 9838 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 9839 """ 9840 transcripts_infos_columns = list( 9841 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 9842 ) 9843 9844 # View results 9845 clause_select = [] 9846 clause_to_json = [] 9847 for field in transcripts_infos_columns: 9848 clause_select.append( 9849 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9850 ) 9851 clause_to_json.append(f""" '{field}': "{field}" """) 9852 9853 # Update 
9854 update_set = [] 9855 9856 # VCF header 9857 vcf_reader = self.get_header() 9858 9859 # Transcripts to info column in JSON 9860 if transcripts_info_json is not None: 9861 9862 # Create column on variants table 9863 self.add_column( 9864 table_name=table_variants, 9865 column_name=transcripts_info_json, 9866 column_type="JSON", 9867 default_value=None, 9868 drop=False, 9869 ) 9870 9871 # Add to update 9872 update_set.append( 9873 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 9874 ) 9875 9876 # Add header 9877 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 9878 transcripts_info_json, 9879 ".", 9880 "String", 9881 "Transcripts in JSON format", 9882 "unknwon", 9883 "unknwon", 9884 self.code_type_map["String"], 9885 ) 9886 9887 # Transcripts to info field in JSON 9888 if transcripts_info_field is not None: 9889 9890 # Add to update 9891 update_set.append( 9892 f""" 9893 INFO = concat( 9894 CASE 9895 WHEN INFO NOT IN ('', '.') 9896 THEN INFO 9897 ELSE '' 9898 END, 9899 CASE 9900 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 9901 THEN concat( 9902 ';{transcripts_info_field}=', 9903 t.{transcripts_info_json} 9904 ) 9905 ELSE '' 9906 END 9907 ) 9908 """ 9909 ) 9910 9911 # Add header 9912 vcf_reader.infos[transcripts_info_field] = vcf.parser._Info( 9913 transcripts_info_field, 9914 ".", 9915 "String", 9916 "Transcripts in JSON format", 9917 "unknwon", 9918 "unknwon", 9919 self.code_type_map["String"], 9920 ) 9921 9922 # Update query 9923 query_update = f""" 9924 UPDATE {table_variants} 9925 SET {", ".join(update_set)} 9926 FROM 9927 ( 9928 SELECT 9929 "#CHROM", POS, REF, ALT, 9930 concat( 9931 '{{', 9932 string_agg( 9933 '"' || "{transcripts_column_id}" || '":' || 9934 to_json(json_output) 9935 ), 9936 '}}' 9937 )::JSON AS {transcripts_info_json} 9938 FROM 9939 ( 9940 SELECT 9941 "#CHROM", POS, REF, ALT, 9942 "{transcripts_column_id}", 9943 to_json( 9944 {{{",".join(clause_to_json)}}} 9945 )::JSON AS json_output 9946 FROM 9947 
(SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 9948 WHERE "{transcripts_column_id}" IS NOT NULL 9949 ) 9950 GROUP BY "#CHROM", POS, REF, ALT 9951 ) AS t 9952 WHERE {table_variants}."#CHROM" = t."#CHROM" 9953 AND {table_variants}."POS" = t."POS" 9954 AND {table_variants}."REF" = t."REF" 9955 AND {table_variants}."ALT" = t."ALT" 9956 """ 9957 9958 self.execute_query(query=query_update) 9959 9960 return True
The function `transcript_view_to_variants` takes input parameters related to transcripts and updates
a variants table with information from the transcripts in JSON format.

Parameters

- transcripts_table: The `transcripts_table` parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the `param` dictionary or use a default value of "transcripts".
- transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in the `transcripts_table` that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database.
- transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format.
- transcripts_info_field: The `transcripts_info_field` parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name.
- param: The `param` parameter is a dictionary providing default values for the parameters above.

Returns

The function `transcript_view_to_variants` returns a boolean value, which is `True` if the operation is successful and `False` if certain conditions are not met.